In [1]:
import simulate
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
#import pypots
from pypots.data import load_specific_dataset, mcar, masked_fill
from pypots.imputation import SAITS
from pypots.utils.metrics import cal_mae

  from .autonotebook import tqdm as notebook_tqdm


# UCI HAR with MCAR, MAR, and MNAR

In [2]:
train_directory = "D:/GitHub/ece209as_project/data/UCI_HAR_Dataset/train/Inertial_Signals/"

train_files = ["body_acc_x_train.txt", "body_acc_y_train.txt", "body_acc_z_train.txt", 
        "body_gyro_x_train.txt", "body_gyro_y_train.txt", "body_gyro_z_train.txt", 
        "body_acc_x_train.txt", "body_acc_y_train.txt", "body_acc_z_train.txt"]

train_df = []
train_y = pd.read_csv("D:/GitHub/ece209as_project/data/UCI_HAR_Dataset/train/y_train.txt", header=None)

test_directory = "D:/GitHub/ece209as_project/data/UCI_HAR_Dataset/test/Inertial_Signals/"

test_files = ["body_acc_x_test.txt", "body_acc_y_test.txt", "body_acc_z_test.txt", 
        "body_gyro_x_test.txt", "body_gyro_y_test.txt", "body_gyro_z_test.txt", 
        "body_acc_x_test.txt", "body_acc_y_test.txt", "body_acc_z_test.txt"]

test_df = []
test_y = pd.read_csv("D:/GitHub/ece209as_project/data/UCI_HAR_Dataset/test/y_test.txt", header=None)


for train_file in train_files:
    df = pd.read_csv(train_directory+train_file, delim_whitespace=True, header=None)
    train_df.append(np.array(df))

for test_file in test_files:
    df = pd.read_csv(test_directory+test_file, delim_whitespace=True, header=None)
    test_df.append(np.array(df))

train_X = np.array(train_df).reshape([len(train_y), 128, 9])
test_X = np.array(test_df).reshape([len(test_y), 128, 9])

In [3]:
# MCAR
dict_MCAR = simulate.simulate_nan(train_X, 0.1, "MCAR")
X_intact_MCAR = dict_MCAR['X_init']
X_missing_MCAR = dict_MCAR['X_incomp']
X_mask_MCAR = dict_MCAR['mask']

saits = SAITS(n_steps=128, n_features=9, n_layers=2, d_model=256, d_inner=128, n_head=4, d_k=64, d_v=64, dropout=0.1, epochs=10)
saits.fit(X_missing_MCAR)
imputation = saits.impute(X_missing_MCAR)
mae = cal_mae(imputation, X_intact_MCAR, X_mask_MCAR)

Model initialized successfully. Number of the trainable parameters: 1332038
epoch 0: training loss 0.2365
epoch 1: training loss 0.1352
epoch 2: training loss 0.1167
epoch 3: training loss 0.1089
epoch 4: training loss 0.1039
epoch 5: training loss 0.1005
epoch 6: training loss 0.0979
epoch 7: training loss 0.0958
epoch 8: training loss 0.0942
epoch 9: training loss 0.0929
Finished training.


In [4]:
# MAR - logistic
dict_MAR = simulate.simulate_nan(train_X, 0.1, "MAR")
X_intact_MAR = dict_MAR['X_init']
X_missing_MAR = dict_MAR['X_incomp']
X_mask_MAR = dict_MAR['mask']

saits = SAITS(n_steps=128, n_features=9, n_layers=2, d_model=256, d_inner=128, n_head=4, d_k=64, d_v=64, dropout=0.1, epochs=10)
saits.fit(X_missing_MAR)
imputation = saits.impute(X_missing_MAR)
mae = cal_mae(imputation, X_intact_MAR, X_mask_MAR)

Model initialized successfully. Number of the trainable parameters: 1332038
epoch 0: training loss 0.1945
epoch 1: training loss 0.1135
epoch 2: training loss 0.0970
epoch 3: training loss 0.0855
epoch 4: training loss 0.0774
epoch 5: training loss 0.0684
epoch 6: training loss 0.0632
epoch 7: training loss 0.0585
epoch 8: training loss 0.0561
epoch 9: training loss 0.0552
Finished training.


In [5]:
# MNAR - logistic
dict_MAR = simulate.simulate_nan(train_X, 0.1, "MNAR", opt="logistic")
X_intact_MNAR = dict_MAR['X_init']
X_missing_MNAR = dict_MAR['X_incomp']
X_mask_MNAR = dict_MAR['mask']

saits = SAITS(n_steps=128, n_features=9, n_layers=2, d_model=256, d_inner=128, n_head=4, d_k=64, d_v=64, dropout=0.1, epochs=10)
saits.fit(X_missing_MNAR)
imputation = saits.impute(X_missing_MNAR)
mae = cal_mae(imputation, X_intact_MNAR, X_mask_MNAR)

Model initialized successfully. Number of the trainable parameters: 1332038
epoch 0: training loss 0.2070
epoch 1: training loss 0.1247
epoch 2: training loss 0.1092
epoch 3: training loss 0.1014
epoch 4: training loss 0.0963
epoch 5: training loss 0.0927
epoch 6: training loss 0.0900
epoch 7: training loss 0.0883
epoch 8: training loss 0.0868
epoch 9: training loss 0.0854
Finished training.


#  PAMAPS2 with MCAR, MAR, and MNAR

In [6]:
train_directory = "D:/GitHub/ece209as_project/data/PAMAP2_Dataset/Protocol"

train_files = "/subject101.dat"

train_df = pd.read_csv(train_directory+train_files, delim_whitespace=True, header=None)

In [7]:
train_Y = np.array(train_df[1])
train_X = np.array(train_df.drop(columns=1))

In [8]:
def createWindows(train_X, train_Y, window_size):
    X = []
    Y = []
    i= 0

    while i<len(train_X):
        count = 0
        j = i
        while j<min(i+window_size, len(train_X)):
            if(train_Y[int(j)] == train_Y[int(i)]):
                count+=1
            else:
                break
            j=j+1
        if(count == window_size):
            X.append(train_X[int(i):int(i+window_size)])
            Y.append(train_Y[int(i)])
            i+=(window_size/2)
        else:
            i=i+1

    X = np.array(X)
    Y = np.array(Y)
    return X, Y

In [9]:
window_size = 40
X_40, Y_40 = createWindows(train_X, train_Y, window_size)

In [11]:
# MCAR
dict_MCAR = simulate.simulate_nan(X_40, 0.1, "MCAR")
X_intact_MCAR = dict_MCAR['X_init']
X_missing_MCAR = dict_MCAR['X_incomp']
X_mask_MCAR = dict_MCAR['mask']


saits = SAITS(n_steps=40, n_features=53, n_layers=2, d_model=256, d_inner=128, n_head=4, d_k=64, d_v=64, dropout=0.1, epochs=10)
saits.fit(X_missing_MCAR)
imputation = saits.impute(X_missing_MCAR)
mae = cal_mae(imputation, X_intact_MCAR, X_mask_MCAR)

Model initialized successfully. Number of the trainable parameters: 1406222
epoch 0: training loss 22.2347
epoch 1: training loss 7.7339
epoch 2: training loss 6.2756
epoch 3: training loss 5.5214
epoch 4: training loss 5.1984
epoch 5: training loss 4.8768
epoch 6: training loss 4.5065
epoch 7: training loss 4.3225
epoch 8: training loss 4.1898
epoch 9: training loss 4.1414
Finished training.


In [12]:
# MAR - logistic
dict_MAR = simulate.simulate_nan(X_40, 0.1, "MAR")
X_intact_MAR = dict_MAR['X_init']
X_missing_MAR = dict_MAR['X_incomp']
X_mask_MAR = dict_MAR['mask']

saits = SAITS(n_steps=40, n_features=53, n_layers=2, d_model=256, d_inner=128, n_head=4, d_k=64, d_v=64, dropout=0.1, epochs=10)
saits.fit(X_missing_MAR)
imputation = saits.impute(X_missing_MAR)
mae = cal_mae(imputation, X_intact_MAR, X_mask_MAR)

Model initialized successfully. Number of the trainable parameters: 1406222
epoch 0: training loss 21.1689
epoch 1: training loss 7.4924
epoch 2: training loss 6.1301
epoch 3: training loss 5.5073
epoch 4: training loss 5.0382
epoch 5: training loss 4.7381
epoch 6: training loss 4.5506
epoch 7: training loss 4.2798
epoch 8: training loss 4.1497
epoch 9: training loss 4.0989
Finished training.


In [13]:
# MNAR - logistic
dict_MAR = simulate.simulate_nan(X_40, 0.1, "MNAR", opt="logistic")
X_intact_MNAR = dict_MAR['X_init']
X_missing_MNAR = dict_MAR['X_incomp']
X_mask_MNAR = dict_MAR['mask']

saits = SAITS(n_steps=40, n_features=53, n_layers=2, d_model=256, d_inner=128, n_head=4, d_k=64, d_v=64, dropout=0.1, epochs=10)
saits.fit(X_missing_MNAR)
imputation = saits.impute(X_missing_MNAR)
mae = cal_mae(imputation, X_intact_MNAR, X_mask_MNAR)

Model initialized successfully. Number of the trainable parameters: 1406222
epoch 0: training loss 21.1177
epoch 1: training loss 7.4697
epoch 2: training loss 6.0145
epoch 3: training loss 5.3788
epoch 4: training loss 4.9042
epoch 5: training loss 4.6789
epoch 6: training loss 4.4280
epoch 7: training loss 4.2274
epoch 8: training loss 4.1101
epoch 9: training loss 3.9475
Finished training.
