In [40]:
import os

directory = '../processed_data/no_miss'
datasets = os.listdir(directory)
datasets_MCAR = [dataset for dataset in datasets if dataset[-7] == 'C']
datasets_MNAR = [dataset for dataset in datasets if dataset[-7] == 'N']

directory = '../processed_data/yes_miss'
datasets = os.listdir(directory)

print('MCAR datasets:', datasets_MCAR)
print('MNAR datasets:', datasets_MNAR)
print('Datasets with missing values:', datasets)

MCAR datasets: ['airfoil_MCAR.csv', 'christine_MCAR.csv', 'philippine_MCAR.csv', 'phoneme_MCAR.csv', 'wine_quality_MCAR.csv']
MNAR datasets: ['airfoil_MNAR.csv', 'christine_MNAR.csv', 'philippine_MNAR.csv', 'phoneme_MNAR.csv', 'wine_quality_MNAR.csv']
Datasets with missing values: ['cirrhosis.csv', 'equity.csv', 'fico.csv', 'support.csv', 'wiki.csv']


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def prepareData(dataset_name, SEED):
    
    if 'MCAR' in dataset_name or 'MNAR' in dataset_name:
        data = pd.read_csv(f'../processed_data/no_miss/{dataset_name}')
    else:
        data = pd.read_csv(f'../processed_data/yes_miss/{dataset_name}')
    
    y = data.y.values
    X = data.drop('y', axis=1).values
    n, dim = X.shape

    if len(data.y.unique()) == 2:
        regression = False
    else:
        regression = True

    # Split the data into training, validation, and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                        random_state=SEED, stratify=y if not regression else None)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, 
                                                      random_state=SEED, stratify=y_train if not regression else None)
    
    
    # Normalize and scale according to the training set
    eps = 1e-6
    mean = np.nanmean(X_train, axis=0)
    std = np.nanstd(X_train, axis=0) + eps
    X_train = (X_train - mean) / std
    X_val = (X_val - mean) / std
    X_test = (X_test - mean) / std
    
    return X_train, X_val, X_test, y_train, y_val, y_test, dim, regression

import torch
from torch.utils.data import DataLoader, TensorDataset

def data2Tensors(X_train, X_val, X_test, y_train, y_val, y_test):
    train_batch = int(2**np.ceil(np.log2(X_train.shape[0]//10)))
    X_train, X_val, X_test = [torch.tensor(x, dtype=torch.float32) for x in [X_train, X_val, X_test]]
    y_train, y_val, y_test = [torch.tensor(y, dtype=torch.float32) for y in [y_train, y_val, y_test]]
    
    train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=train_batch, shuffle=True)
    val_loader, test_loader = [DataLoader(TensorDataset(X, y), batch_size=X.shape[0], shuffle=False) for X, y in [(X_val, y_val), (X_test, y_test)]]
    return train_loader, val_loader, test_loader

from train_utils import getPredictions
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def appendResults(model, test_loader, device, training_time, regression_flag):
    y_pred, y_test = getPredictions(model, test_loader, device)

    if regression_flag:
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        return [training_time, mse, mae, r2, np.nan, np.nan, np.nan, np.nan, np.nan]

    else:
        y_pred, y_test = getPredictions(model, test_loader, device)
        y_pred = ((torch.sigmoid(torch.tensor(y_pred)) >= 0.5) * 1.).cpu().numpy()
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred)
        rec = recall_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_pred)
        return [training_time, np.nan, np.nan, np.nan, acc, prec, rec, f1, roc_auc]

# MCAR

In [41]:
# Time
import time

# Sklearn Metrics
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# MLP
from models import CustomDropoutModel, CustomNeuMissMLP
from train_utils import train, getPredictions

# Evaluation
metrics = ['time', 'mse', 'mae', 'r2', 'acc', 'prec', 'rec', 'f1', 'roc_auc']

# Experiment
seeds = [int(bin(i)[2:]) for i in list(range(5))]
results_dropout = np.zeros((len(datasets_MCAR), len(seeds), len(metrics)))
results_neumiss = np.zeros((len(datasets_MCAR), len(seeds), len(metrics)))

for did, mcar_dataset in enumerate(datasets_MCAR):
    for sid, seed in enumerate(seeds):
        # Prepare data
        X_train, X_val, X_test, y_train, y_val, y_test, dim, regression = prepareData(mcar_dataset, seed)
        train_loader, val_loader, test_loader = data2Tensors(X_train, X_val, X_test, y_train, y_val, y_test)
        
        if regression:
            # Fit Custom Dropout Model
            print(f'Dataset: {did+1}, Seed: {sid+1}, Model: (Custom Dropout - Regression)\n')
            start_time = time.time()
            dropout_model = CustomDropoutModel(input_layer=dim, 
                                               na_layers=[2**int(np.log2(dim)+1)], 
                                               model_layers=[2**int(np.log2(dim)+1), 2**int(np.log2(dim))])
            epochs = 5000
            patience = 100
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
            
            train(model=dropout_model, 
                  train_loader=train_loader, 
                  val_loader=val_loader, 
                  epochs=epochs, 
                  patience=patience, 
                  regression_flag=regression, 
                  device=device, 
                  seed=seed,
                  verbose = True)
            end_time = time.time()

            training_time = end_time - start_time
            results_dropout[did, sid] = appendResults(dropout_model, test_loader, device, training_time, regression)

            # Fit NeuMiss Model
            print(f'Dataset: {did+1}, Seed: {sid+1}, Model: (NeuMiss - Regression)\n')
            start_time = time.time()
            neumiss_model = CustomNeuMissMLP(n_features=dim, 
                                             neumiss_depth=30,
                                             mlp_layers=[2**int(np.log2(dim)+1), 2**int(np.log2(dim))])
            epochs = 5000
            patience = 100
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
            
            train(model=neumiss_model, 
                  train_loader=train_loader, 
                  val_loader=val_loader, 
                  epochs=epochs, 
                  patience=patience, 
                  regression_flag=regression, 
                  device=device, 
                  seed=seed,
                  verbose = True)
            end_time = time.time()

            training_time = end_time - start_time
            results_neumiss[did, sid] = appendResults(neumiss_model, test_loader, device, training_time, regression)
            
        else:
            # Fit Custom Dropout Model
            print(f'Dataset: {did+1}, Seed: {sid+1}, Model: (Custom Dropout - Classification)\n')
            start_time = time.time()
            dropout_model = CustomDropoutModel(input_layer=dim, 
                                               na_layers=[2**int(np.log2(dim)+1)], 
                                               model_layers=[2**int(np.log2(dim)+1), 2**int(np.log2(dim))])
            epochs = 5000
            patience = 100
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
            
            train(model=dropout_model, 
                  train_loader=train_loader, 
                  val_loader=val_loader, 
                  epochs=epochs, 
                  patience=patience, 
                  regression_flag=regression, 
                  device=device, 
                  seed=seed,
                  verbose = True)
            end_time = time.time()

            training_time = end_time - start_time
            results_dropout[did, sid] = appendResults(dropout_model, test_loader, device, training_time, regression)

            # Fit NeuMiss Model
            print(f'Dataset: {did+1}, Seed: {sid+1}, Model: (NeuMiss - Classification)\n')
            start_time = time.time()
            neumiss_model = CustomNeuMissMLP(n_features=dim, 
                                             neumiss_depth=30,
                                             mlp_layers=[2**int(np.log2(dim)+1), 2**int(np.log2(dim))])
            epochs = 5000
            patience = 100
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
            
            train(model=neumiss_model, 
                  train_loader=train_loader, 
                  val_loader=val_loader, 
                  epochs=epochs, 
                  patience=patience, 
                  regression_flag=regression, 
                  device=device, 
                  seed=seed,
                  verbose = True)
            end_time = time.time()

            training_time = end_time - start_time
            results_neumiss[did, sid] = appendResults(neumiss_model, test_loader, device, training_time, regression)

        print(f'\nTraining results: {results_dropout[did, sid][3] if regression else results_dropout[did, sid][-1]:.4f} (Dropout) | {results_neumiss[did, sid][3] if regression else results_neumiss[did, sid][-1]:.4f} (NeuMiss)\n')
        np.save('../results/raw/results_dropout_MCAR.npy', results_dropout)
        np.save('../results/raw/results_neumiss_MCAR.npy', results_neumiss)

Dataset: 1, Seed: 1, Model: (Custom Dropout - Regression)

Epoch 500, val loss: 13569.1884765625
Epoch 1000, val loss: 6349.57666015625
Epoch 1500, val loss: 2471.039306640625
Epoch 2000, val loss: 543.5783081054688
Epoch 2500, val loss: 233.14556884765625
Epoch 3000, val loss: 117.56036376953125
Epoch 3500, val loss: 59.41341018676758
Epoch 4000, val loss: 39.628868103027344
Epoch 4500, val loss: 31.80002784729004
Epoch 5000, val loss: 29.001121520996094
Dataset: 1, Seed: 1, Model: (NeuMiss - Regression)

Epoch 500, val loss: 2707.973388671875
Epoch 1000, val loss: 1023.2122192382812
Epoch 1500, val loss: 229.57289123535156
Epoch 2000, val loss: 82.78677368164062
Epoch 2500, val loss: 53.61659240722656
Epoch 3000, val loss: 41.883968353271484
Epoch 3500, val loss: 35.67790222167969
Epoch 4000, val loss: 32.16339111328125
Epoch 4500, val loss: 30.31755828857422
Epoch 5000, val loss: 29.157289505004883

Training results: 0.3942 (Dropout) | 0.4670 (NeuMiss)

Dataset: 1, Seed: 2, Model: (

# MNAR

In [42]:
# Time
import time

# Sklearn Metrics
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# MLP
from models import CustomDropoutModel, CustomNeuMissMLP
from train_utils import train

# Evaluation
metrics = ['time', 'mse', 'mae', 'r2', 'acc', 'prec', 'rec', 'f1', 'roc_auc']

# Experiment
seeds = [int(bin(i)[2:]) for i in list(range(5))]
results_dropout = np.zeros((len(datasets_MNAR), len(seeds), len(metrics)))
results_neumiss = np.zeros((len(datasets_MNAR), len(seeds), len(metrics)))

for did, mnar_dataset in enumerate(datasets_MNAR):
    for sid, seed in enumerate(seeds):
        # Prepare data
        X_train, X_val, X_test, y_train, y_val, y_test, dim, regression = prepareData(mnar_dataset, seed)
        train_loader, val_loader, test_loader = data2Tensors(X_train, X_val, X_test, y_train, y_val, y_test)
        
        if regression:
            # Fit Custom Dropout Model
            print(f'Dataset: {did+1}, Seed: {sid+1}, Model: (Custom Dropout - Regression)\n')
            start_time = time.time()
            dropout_model = CustomDropoutModel(input_layer=dim, 
                                               na_layers=[2**int(np.log2(dim)+1)], 
                                               model_layers=[2**int(np.log2(dim)+1), 2**int(np.log2(dim))])
            epochs = 5000
            patience = 100
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
            
            train(model=dropout_model, 
                  train_loader=train_loader, 
                  val_loader=val_loader, 
                  epochs=epochs, 
                  patience=patience, 
                  regression_flag=regression, 
                  device=device, 
                  seed=seed,
                  verbose = True)
            end_time = time.time()

            training_time = end_time - start_time
            results_dropout[did, sid] = appendResults(dropout_model, test_loader, device, training_time, regression)

            # Fit NeuMiss Model
            print(f'Dataset: {did+1}, Seed: {sid+1}, Model: (NeuMiss - Regression)\n')
            start_time = time.time()
            neumiss_model = CustomNeuMissMLP(n_features=dim, 
                                             neumiss_depth=30,
                                             mlp_layers=[2**int(np.log2(dim)+1), 2**int(np.log2(dim))])
            epochs = 5000
            patience = 100
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
            
            train(model=neumiss_model, 
                  train_loader=train_loader, 
                  val_loader=val_loader, 
                  epochs=epochs, 
                  patience=patience, 
                  regression_flag=regression, 
                  device=device, 
                  seed=seed,
                  verbose = True)
            end_time = time.time()

            training_time = end_time - start_time
            results_neumiss[did, sid] = appendResults(neumiss_model, test_loader, device, training_time, regression)
            
        else:
            # Fit Custom Dropout Model
            print(f'Dataset: {did+1}, Seed: {sid+1}, Model: (Custom Dropout - Classification)\n')
            start_time = time.time()
            dropout_model = CustomDropoutModel(input_layer=dim, 
                                               na_layers=[2**int(np.log2(dim)+1)], 
                                               model_layers=[2**int(np.log2(dim)+1), 2**int(np.log2(dim))])
            epochs = 5000
            patience = 100
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
            
            train(model=dropout_model, 
                  train_loader=train_loader, 
                  val_loader=val_loader, 
                  epochs=epochs, 
                  patience=patience, 
                  regression_flag=regression, 
                  device=device, 
                  seed=seed,
                  verbose = True)
            end_time = time.time()

            training_time = end_time - start_time
            results_dropout[did, sid] = appendResults(dropout_model, test_loader, device, training_time, regression)

            # Fit NeuMiss Model
            print(f'Dataset: {did+1}, Seed: {sid+1}, Model: (NeuMiss - Classification)\n')
            start_time = time.time()
            neumiss_model = CustomNeuMissMLP(n_features=dim, 
                                             neumiss_depth=30,
                                             mlp_layers=[2**int(np.log2(dim)+1), 2**int(np.log2(dim))])
            epochs = 5000
            patience = 100
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
            
            train(model=neumiss_model, 
                  train_loader=train_loader, 
                  val_loader=val_loader, 
                  epochs=epochs, 
                  patience=patience, 
                  regression_flag=regression, 
                  device=device, 
                  seed=seed,
                  verbose = True)
            end_time = time.time()

            training_time = end_time - start_time
            results_neumiss[did, sid] = appendResults(neumiss_model, test_loader, device, training_time, regression)

        print(f'\nTraining results: {results_dropout[did, sid][3] if regression else results_dropout[did, sid][-1]:.4f} (Dropout) | {results_neumiss[did, sid][3] if regression else results_neumiss[did, sid][-1]:.4f} (NeuMiss)\n')
        np.save('../results/raw/results_dropout_MNAR.npy', results_dropout)
        np.save('../results/raw/results_neumiss_MNAR.npy', results_neumiss)

Dataset: 1, Seed: 1, Model: (Custom Dropout - Regression)

Epoch 500, val loss: 13515.3828125
Epoch 1000, val loss: 7335.3916015625
Epoch 1500, val loss: 4007.19677734375
Epoch 2000, val loss: 1268.1922607421875
Epoch 2500, val loss: 432.17218017578125
Epoch 3000, val loss: 143.4412078857422
Epoch 3500, val loss: 78.02128601074219
Epoch 4000, val loss: 46.11606216430664
Epoch 4500, val loss: 33.57711410522461
Epoch 5000, val loss: 26.761905670166016
Dataset: 1, Seed: 1, Model: (NeuMiss - Regression)

Epoch 500, val loss: 1144.7779541015625
Epoch 1000, val loss: 619.6669311523438
Epoch 1500, val loss: 340.102783203125
Epoch 2000, val loss: 190.78880310058594
Epoch 2500, val loss: 107.05058288574219
Epoch 3000, val loss: 63.77729797363281
Epoch 3500, val loss: 45.013710021972656
Epoch 4000, val loss: 37.328758239746094
Epoch 4500, val loss: 31.453142166137695
Epoch 5000, val loss: 27.65372085571289

Training results: 0.4511 (Dropout) | 0.5571 (NeuMiss)

Dataset: 1, Seed: 2, Model: (Custo

# REAL MISSINGNESS

In [6]:
# Time
import time

# MLP
from models import CustomDropoutModel, CustomNeuMissMLP
from train_utils import train

# Evaluation
metrics = ['time', 'mse', 'mae', 'r2', 'acc', 'prec', 'rec', 'f1', 'roc_auc']

# Experiment
seeds = [int(bin(i)[2:]) for i in list(range(5))]
results_dropout = np.zeros((len(datasets), len(seeds), len(metrics)))
results_neumiss = np.zeros((len(datasets), len(seeds), len(metrics)))

for did, dataset in enumerate(datasets):
    for sid, seed in enumerate(seeds):
        # Prepare data
        X_train, X_val, X_test, y_train, y_val, y_test, dim, regression = prepareData(dataset, seed)
        train_loader, val_loader, test_loader = data2Tensors(X_train, X_val, X_test, y_train, y_val, y_test)
        
        if regression:
            # Fit Custom Dropout Model
            print(f'Dataset: {did+1}, Seed: {sid+1}, Model: (Custom Dropout - Regression)\n')
            start_time = time.time()
            dropout_model = CustomDropoutModel(input_layer=dim, 
                                               na_layers=[2**int(np.log2(dim)+1)], 
                                               model_layers=[2**int(np.log2(dim)+1), 2**int(np.log2(dim))])
            epochs = 5000
            patience = 100
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
            
            train(model=dropout_model, 
                  train_loader=train_loader, 
                  val_loader=val_loader, 
                  epochs=epochs, 
                  patience=patience, 
                  regression_flag=regression, 
                  device=device, 
                  seed=seed,
                  verbose = True)
            end_time = time.time()

            training_time = end_time - start_time
            results_dropout[did, sid] = appendResults(dropout_model, test_loader, device, training_time, regression)

            # Fit NeuMiss Model
            print(f'Dataset: {did+1}, Seed: {sid+1}, Model: (NeuMiss - Regression)\n')
            start_time = time.time()
            neumiss_model = CustomNeuMissMLP(n_features=dim, 
                                             neumiss_depth=30,
                                             mlp_layers=[2**int(np.log2(dim)+1), 2**int(np.log2(dim))])
            epochs = 5000
            patience = 100
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
            
            train(model=neumiss_model, 
                  train_loader=train_loader, 
                  val_loader=val_loader, 
                  epochs=epochs, 
                  patience=patience, 
                  regression_flag=regression, 
                  device=device, 
                  seed=seed,
                  verbose = True)
            end_time = time.time()

            training_time = end_time - start_time
            results_neumiss[did, sid] = appendResults(neumiss_model, test_loader, device, training_time, regression)
            
        else:
            # Fit Custom Dropout Model
            print(f'Dataset: {did+1}, Seed: {sid+1}, Model: (Custom Dropout - Classification)\n')
            start_time = time.time()
            dropout_model = CustomDropoutModel(input_layer=dim, 
                                               na_layers=[2**int(np.log2(dim)+1)], 
                                               model_layers=[2**int(np.log2(dim)+1), 2**int(np.log2(dim))])
            epochs = 5000
            patience = 100
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
            
            train(model=dropout_model, 
                  train_loader=train_loader, 
                  val_loader=val_loader, 
                  epochs=epochs, 
                  patience=patience, 
                  regression_flag=regression, 
                  device=device, 
                  seed=seed,
                  verbose = True)
            end_time = time.time()

            training_time = end_time - start_time
            results_dropout[did, sid] = appendResults(dropout_model, test_loader, device, training_time, regression)

            # Fit NeuMiss Model
            print(f'Dataset: {did+1}, Seed: {sid+1}, Model: (NeuMiss - Classification)\n')
            start_time = time.time()
            neumiss_model = CustomNeuMissMLP(n_features=dim, 
                                             neumiss_depth=30,
                                             mlp_layers=[2**int(np.log2(dim)+1), 2**int(np.log2(dim))])
            epochs = 5000
            patience = 100
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
            
            train(model=neumiss_model, 
                  train_loader=train_loader, 
                  val_loader=val_loader, 
                  epochs=epochs, 
                  patience=patience, 
                  regression_flag=regression, 
                  device=device, 
                  seed=seed,
                  verbose = True)
            end_time = time.time()

            training_time = end_time - start_time
            results_neumiss[did, sid] = appendResults(neumiss_model, test_loader, device, training_time, regression)

        print(f'\nTraining results: {results_dropout[did, sid][3] if regression else results_dropout[did, sid][-1]:.4f} (Dropout) | {results_neumiss[did, sid][3] if regression else results_neumiss[did, sid][-1]:.4f} (NeuMiss)\n')
        np.save('../results/raw/results_dropout_real.npy', results_dropout)
        np.save('../results/raw/results_neumiss_real.npy', results_neumiss)

Dataset: 1, Seed: 1, Model: (Custom Dropout - Classification)

Epoch 500, val loss: 0.5226673483848572
Dataset: 1, Seed: 1, Model: (NeuMiss - Classification)


Training results: 0.7300 (Dropout) | 0.7171 (NeuMiss)

Dataset: 1, Seed: 2, Model: (Custom Dropout - Classification)

Epoch 500, val loss: 0.5339129567146301
Dataset: 1, Seed: 2, Model: (NeuMiss - Classification)

Epoch 500, val loss: 0.5232256650924683

Training results: 0.6786 (Dropout) | 0.6863 (NeuMiss)

Dataset: 1, Seed: 3, Model: (Custom Dropout - Classification)

Dataset: 1, Seed: 3, Model: (NeuMiss - Classification)


Training results: 0.7301 (Dropout) | 0.7070 (NeuMiss)

Dataset: 1, Seed: 4, Model: (Custom Dropout - Classification)

Dataset: 1, Seed: 4, Model: (NeuMiss - Classification)


Training results: 0.7198 (Dropout) | 0.7455 (NeuMiss)

Dataset: 1, Seed: 5, Model: (Custom Dropout - Classification)

Epoch 500, val loss: 0.5456133484840393
Dataset: 1, Seed: 5, Model: (NeuMiss - Classification)


Training results: 0.