In [2]:
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import sqlalchemy
from sqlalchemy import create_engine
import s3fs
from tqdm import tqdm

from functools import partial
import ray

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

from dotenv import load_dotenv

  from .autonotebook import tqdm as notebook_tqdm


In [24]:
# Data Processing Functions
def load_data_to_df():
    """Load every non-one-hot-encoded column of the final dataset into a DataFrame.

    The Postgres password and host are read from the ``db_password`` and
    ``db_location`` environment variables.

    Returns:
        pandas.DataFrame with one column per entry of
        ``information_schema.columns`` for ``public.tb_final_dataset`` whose
        name does not contain ``"ohe"``, selected from
        ``public.vw_final_dataset``.
    """
    location = f"postgresql://postgres:{os.environ.get('db_password')}@{os.environ.get('db_location')}"
    engine = create_engine(location)
    try:
        # The context manager returns the connection to the pool even if a query fails.
        with engine.connect() as conn:
            column_info = pd.read_sql("select column_name from information_schema.columns where table_schema = 'public' and table_name = 'tb_final_dataset'", conn)
            # Column names come from the metadata query result (the original
            # read them from `raw_df`, which is not assigned until below).
            columns = list(column_info['column_name'])
            select_cols = [c for c in columns if "ohe" not in c]
            string = ", ".join(select_cols)
            # NOTE(review): names are listed from tb_final_dataset but selected
            # from vw_final_dataset -- confirm the view exposes the same columns.
            raw_df = pd.read_sql(f"select {string} from public.vw_final_dataset", conn)
    finally:
        # Release pooled connections held by this throwaway engine.
        engine.dispose()
    return raw_df

def clean_data(df):
    """Clean the raw dataset and one-hot encode the weekday column.

    Steps, in order: drop rows without a ``bg`` value, drop columns whose name
    contains ``"ohe"``, sort by and index on ``timestamp_clean``, drop the first
    row of every ``subjectid``, drop identifier/timestamp columns, drop rows
    with a null ``weekday``, fill all remaining nulls with 0 (printing the
    per-column counts first), and replace ``weekday`` with ``ohe_<day>``
    indicator columns.

    Args:
        df: DataFrame containing at least ``bg``, ``timestamp_clean``,
            ``subjectid``, ``entryid``, ``timestamp``, ``date``, ``time`` and
            ``weekday``. It is not modified.

    Returns:
        A new, cleaned DataFrame indexed by ``timestamp_clean``.
    """
    # Drop rows with no Y value (list form of `subset` works on every pandas version)
    df = df.dropna(subset=['bg'])

    # Drop OHE columns
    df = df.loc[:, ~df.columns.str.contains("ohe")]

    # Sort by timestamp
    df = df.sort_values(by="timestamp_clean")

    # Set index to time_stamp_clean
    df.index = df['timestamp_clean']
    df = df.drop(labels=['timestamp_clean'], axis=1)

    # Drop first row by subject which has data quality issues
    df = df[df.groupby('subjectid').cumcount() > 0]

    # Drop columns that are indices, irrelevant, or capture in OHE variables
    drop_cols = ['subjectid', 'entryid', 'timestamp', 'date', 'time']
    df = df.drop(labels=drop_cols, axis=1)

    # Drop null week days (In need of better solution)
    df = df.loc[~df['weekday'].isna(), :]

    # Fill nulls (lag BG values) with 0 to indicate data is unavailable
    print("Null values to be filled by column:")
    nulls = df.isna().sum()
    for col, val in nulls[nulls > 0].items():
        print(col, val)
    df = df.fillna(0)

    # One hot Encode Weekdays
    weekdays = np.unique(df['weekday'])
    ohe_weekdays = [f"ohe_{day}" for day in weekdays]
    # Pin the dtype: pandas >= 2.0 defaults to bool, which turns the frame's
    # `.values` into an object array once mixed with float columns.
    # Dummy columns are matched to `ohe_weekdays` by position; both are in
    # sorted weekday order.
    df[ohe_weekdays] = pd.get_dummies(df['weekday'], dtype=np.uint8)
    df = df.drop(labels="weekday", axis=1)

    return df

def split_and_scale(df, scaler=None):
    """Split a cleaned frame into train/validation/test sets and scale the features.

    Rows are assigned by the ``train_set``, ``validation_set`` and ``test_set``
    flag columns (value 1). Columns whose name contains ``"ohe"`` are passed
    through unscaled; every other feature column is transformed by ``scaler``.

    Args:
        df: DataFrame holding the three flag columns, the target ``bg`` and the
            feature columns.
        scaler: Optional already-fitted scaler exposing ``transform``. When
            ``None`` (default) a ``MinMaxScaler`` is created and fitted on the
            training features only.

    Returns:
        ``(train_df, val_df, test_df, scaler)``. Each frame holds the scaled
        columns, then the one-hot columns, then ``bg``. ``scaler`` is the
        scaler that was used.
    """
    # train/val/test split
    train_df = df.loc[df['train_set'] == 1, :]
    val_df = df.loc[df['validation_set'] == 1, :]
    test_df = df.loc[df['test_set'] == 1, :]

    # Extract y vars
    train_y = train_df['bg']
    val_y = val_df['bg']
    test_y = test_df['bg']

    # Drop non-X columns
    drop_cols = ['train_set', 'validation_set', 'test_set', 'bg']
    train_df = train_df.drop(labels=drop_cols, axis=1)
    val_df = val_df.drop(labels=drop_cols, axis=1)
    test_df = test_df.drop(labels=drop_cols, axis=1)

    # Select Scaling columns (i.e. don't scale one hot encoded variables)
    ohe_cols = train_df.columns[train_df.columns.str.contains('ohe')]
    scaling_cols = train_df.columns.difference(ohe_cols)
    print(f"{len(ohe_cols)} one hot encoded columns ")
    print(f"{len(scaling_cols)} scaled columns")

    # Fit Scaler if one isn't provided (a caller-supplied scaler is used as-is)
    if scaler is None:
        scaler = MinMaxScaler()
        scaler.fit(train_df[scaling_cols])

    def _scale_and_recombine(features, target):
        # Scale the numeric columns, then reattach the one-hot columns and target.
        scaled = pd.DataFrame(
            scaler.transform(features[scaling_cols]),
            columns=scaling_cols,
            index=features.index,
        )
        return pd.concat([scaled, features.loc[:, ohe_cols], target], axis=1)

    train_df = _scale_and_recombine(train_df, train_y)
    val_df = _scale_and_recombine(val_df, val_y)
    test_df = _scale_and_recombine(test_df, test_y)

    return train_df, val_df, test_df, scaler

def split_and_scale_holdouts(df, scaler):
    """Scale a holdout frame with an existing scaler.

    Args:
        df: DataFrame holding ``train_set``, ``validation_set``, ``test_set``,
            the target ``bg`` and the feature columns.
        scaler: Object exposing ``transform``; it is applied to every feature
            column whose name does not contain ``"ohe"``.

    Returns:
        DataFrame with the scaled columns, then the one-hot columns, then ``bg``.
    """
    target = df['bg']
    features = df.drop(['train_set', 'validation_set', 'test_set', 'bg'], axis=1)

    # One-hot columns are passed through untouched; everything else is scaled
    is_ohe = features.columns.str.contains('ohe')
    ohe_cols = features.columns[is_ohe]
    scaling_cols = features.columns.difference(ohe_cols)
    print(f"{len(ohe_cols)} one hot encoded columns ")
    print(f"{len(scaling_cols)} scaled columns")

    scaled_part = pd.DataFrame(
        scaler.transform(features[scaling_cols]),
        columns=scaling_cols,
        index=features.index,
    )
    return pd.concat([scaled_part, features.loc[:, ohe_cols], target], axis=1)

def df_to_Xy_tensors(df, window_size=12):
    """Turn a frame into sliding-window feature tensors and their targets.

    For every ``idx`` in ``range(window_size, len(df) - window_size)`` the rows
    ``idx - window_size`` to ``idx - 1`` form one window. Its features are all
    columns except ``bg``; its target is the ``bg`` value of the window's first row.

    Args:
        df: DataFrame with a ``bg`` column plus the feature columns.
        window_size: Number of consecutive rows per window.

    Returns:
        ``(X_tensor, y_tensor)``: float tensors of shape
        ``(n_windows, window_size, n_features)`` and ``(n_windows,)``. When the
        frame is too short to produce a window, both are empty
        (``n_windows == 0``) instead of raising.
    """
    num_features = len(df.columns) - 1

    # Pull the arrays out once; slicing numpy in the loop yields the same
    # windows as per-iteration DataFrame slicing.
    feature_values = df.loc[:, df.columns != 'bg'].values
    bg_values = df['bg'].values

    X = []
    y = []
    for idx in tqdm(range(window_size, len(df) - window_size)):
        first_row = idx - window_size
        X.append(feature_values[first_row:idx])
        # The first element is the y value associated with the sequence of X values
        # NOTE(review): target is the bg of the window's first row -- confirm intended.
        y.append(bg_values[first_row])

    if not X:
        # torch.cat / np.stack reject an empty list; return well-shaped empty tensors.
        return torch.empty((0, window_size, num_features)), torch.empty(0)

    X_tensor = torch.tensor(np.stack(X)).float()
    y_tensor = torch.tensor(np.asarray(y)).float()
    return X_tensor, y_tensor

In [4]:
# Load variables from a .env file into the process environment;
# load_data_to_df reads db_password and db_location from os.environ.
load_dotenv()

True

In [21]:
# Subject ids kept out of the in-sample data; their rows form the holdout set.
held_out_subjects = [60844515, 41131654, 40997757, 94200862, 91161972, 28608066,
                     76817975, 37875431, 63047517, 72492570, 80796147, 87770486,
                     95851255, 70454270]

In [25]:
# Configure data
# Time the database load, then separate held-out subjects from in-sample ones
start = datetime.now()
raw_df = load_data_to_df()
print(f"Data loaded in {datetime.now() - start}")
is_held_out = raw_df['subjectid'].isin(held_out_subjects)
holdout_subjects = raw_df.loc[is_held_out]
train_subjects = raw_df.loc[~is_held_out]

Data loaded in 0:00:00.000039


In [27]:
# Clean In sample Data
# (clean_data prints a per-column count of the nulls it fills with 0)
clean_insample_df = clean_data(train_subjects)
# Split and Scale Data
# `scaler` is kept so the holdout subjects can be transformed with the same scaling
train_df, val_df, test_df, scaler = split_and_scale(clean_insample_df)

Null values to be filled by column:
bg_lag_1 32768
bg_lag_2 33941
bg_lag_3 34198
bg_lag_4 34729
bg_lag_5 35258
bg_lag_6 35452
bg_lag_7 35867
bg_lag_8 35992
bg_lag_9 36207
bg_lag_10 36429
bg_lag_11 36641
bg_lag_12 36773
7 one hot encoded columns 
23 scaled columns


NameError: name 'clean_holdout_df' is not defined

In [28]:
# Clean the held-out subjects, then scale them with the scaler returned by split_and_scale
clean_holdout_df = clean_data(holdout_subjects)
holdout_scaled = split_and_scale_holdouts(clean_holdout_df, scaler)

Null values to be filled by column:
bg_lag_1 8547
bg_lag_2 8022
bg_lag_3 7857
bg_lag_4 8150
bg_lag_5 8218
bg_lag_6 8163
bg_lag_7 8230
bg_lag_8 8394
bg_lag_9 8295
bg_lag_10 8384
bg_lag_11 8460
bg_lag_12 8495
7 one hot encoded columns 
23 scaled columns


In [29]:
# Run a garbage-collection pass before building the window tensors.
# NOTE(review): consider moving this import into the notebook's top import cell.
import gc
gc.collect()

997

In [30]:
# Build sliding-window tensors for the training split (12 rows per window)
print("Creating Train Tensors")
train_X, train_y = df_to_Xy_tensors(train_df, window_size=12)

Creating Train Tensors


100%|█████████████████████████████████████████████████████████████████████████████████████| 2822590/2822590 [21:39<00:00, 2172.30it/s]


In [34]:
# Write the training tensors to the S3 paths below
s3 = s3fs.S3FileSystem()
with s3.open("s3://bgpredict/models/lstm/tensors/train_X.pt", 'wb') as f:
    torch.save(train_X, f)
with s3.open("s3://bgpredict/models/lstm/tensors/train_y.pt", 'wb') as f:
    torch.save(train_y, f)

In [31]:
# Also keep a copy of the training tensors in the current working directory
with open("./train_X.pt", 'wb') as f:
    torch.save(train_X, f)
with open("./train_y.pt", 'wb') as f:
    torch.save(train_y, f)

In [26]:
# The training tensors are saved above; drop the names so the memory can be reclaimed
del train_X, train_y

NameError: name 'train_X' is not defined

In [35]:
# Build sliding-window tensors for the validation split.
# The window length is given explicitly (12, as for the training tensors)
# because no `window_size` variable is defined anywhere in this notebook.
print("Creating Validation Tensors")
val_X, val_y = df_to_Xy_tensors(val_df, window_size=12)

Creating Validation Tensors


100%|█████████████████████████████████████████████████████████████████████████████████████| 1346613/1346613 [10:09<00:00, 2209.42it/s]


In [36]:
# Write the validation tensors to the S3 paths below
s3 = s3fs.S3FileSystem()
with s3.open("s3://bgpredict/models/lstm/tensors/val_X.pt", 'wb') as f:
    torch.save(val_X, f)
with s3.open("s3://bgpredict/models/lstm/tensors/val_y.pt", 'wb') as f:
    torch.save(val_y, f)

In [37]:
# Also keep a copy of the validation tensors in the current working directory
with open("./val_X.pt", 'wb') as f:
    torch.save(val_X, f)
with open("./val_y.pt", 'wb') as f:
    torch.save(val_y, f)

In [38]:
# Build sliding-window tensors for the test split.
# The window length is given explicitly (12, as for the training tensors)
# because no `window_size` variable is defined anywhere in this notebook.
print("Creating Test Tensors")
test_X, test_y = df_to_Xy_tensors(test_df, window_size=12)

Creating Test Tensors


100%|█████████████████████████████████████████████████████████████████████████████████████| 1171256/1171256 [08:46<00:00, 2226.30it/s]


In [39]:
# Write the test tensors to the S3 paths below
s3 = s3fs.S3FileSystem()
with s3.open("s3://bgpredict/models/lstm/tensors/test_X.pt", 'wb') as f:
    torch.save(test_X, f)
with s3.open("s3://bgpredict/models/lstm/tensors/test_y.pt", 'wb') as f:
    torch.save(test_y, f)

In [40]:
# Also keep a copy of the test tensors in the current working directory
with open("./test_X.pt", 'wb') as f:
    torch.save(test_X, f)
with open("./test_y.pt", 'wb') as f:
    torch.save(test_y, f)

In [41]:
# Build sliding-window tensors for the held-out subjects
# (no window_size passed, so df_to_Xy_tensors uses its default of 12)
print("Creating Holdout Tensors")
holdout_X, holdout_y = df_to_Xy_tensors(holdout_scaled)

Creating Holdout Tensors


100%|███████████████████████████████████████████████████████████████████████████████████████| 572731/572731 [04:20<00:00, 2202.44it/s]


In [42]:
# Write the holdout tensors to the S3 paths below
s3 = s3fs.S3FileSystem()
with s3.open("s3://bgpredict/models/lstm/tensors/holdout_X.pt", 'wb') as f:
    torch.save(holdout_X, f)
with s3.open("s3://bgpredict/models/lstm/tensors/holdout_y.pt", 'wb') as f:
    torch.save(holdout_y, f)

In [43]:
# Also keep a copy of the holdout tensors in the current working directory
with open("./holdout_X.pt", 'wb') as f:
    torch.save(holdout_X, f)
with open("./holdout_y.pt", 'wb') as f:
    torch.save(holdout_y, f)