In [2]:
import os

directory = '../processed_data/no_miss'
datasets = os.listdir(directory)
datasets_MCAR = [dataset for dataset in datasets if dataset[-7] == 'C']
datasets_MNAR = [dataset for dataset in datasets if dataset[-7] == 'N']

directory = '../processed_data/yes_miss'
datasets = os.listdir(directory)

print('MCAR datasets:', datasets_MCAR)
print('MNAR datasets:', datasets_MNAR)
print('Datasets with missing values:', datasets)

MCAR datasets: ['airfoil_MCAR.csv', 'christine_MCAR.csv', 'philippine_MCAR.csv', 'phoneme_MCAR.csv', 'wine_quality_MCAR.csv']
MNAR datasets: ['airfoil_MNAR.csv', 'christine_MNAR.csv', 'philippine_MNAR.csv', 'phoneme_MNAR.csv', 'wine_quality_MNAR.csv']
Datasets with missing values: ['cirrhosis.csv', 'equity.csv', 'fico.csv', 'support.csv', 'wiki.csv']


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def prepareDataWithNans(dataset_name, SEED):
    data = pd.read_csv(f'../processed_data/no_miss/{dataset_name}')
    y = data.y.values
    X = data.drop('y', axis=1).values
    n, dim = X.shape

    if len(data.y.unique()) == 2:
        regression = False
    else:
        regression = True

    # Split the data into training, validation, and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                        random_state=SEED, stratify=y if not regression else None)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, 
                                                      random_state=SEED, stratify=y_train if not regression else None)
       
    # Normalize and scale according to the training set
    eps = 1e-6
    mean = np.nanmean(X_train, axis=0)
    std = np.nanstd(X_train, axis=0) + eps
    X_train = (X_train - mean) / std
    X_val = (X_val - mean) / std
    X_test = (X_test - mean) / std
    
    return X_train, X_val, X_test, y_train, y_val, y_test, dim, regression

import torch
from torch.utils.data import DataLoader, TensorDataset

def data2Tensors(X_train, X_val, X_test, y_train, y_val, y_test):
    train_batch = int(2**np.ceil(np.log2(X_train.shape[0]//10)))
    X_train, X_val, X_test = [torch.tensor(x, dtype=torch.float32) for x in [X_train, X_val, X_test]]
    y_train, y_val, y_test = [torch.tensor(y, dtype=torch.float32) for y in [y_train, y_val, y_test]]
    
    train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=train_batch, shuffle=True)
    val_loader, test_loader = [DataLoader(TensorDataset(X, y), batch_size=X.shape[0], shuffle=False) for X, y in [(X_val, y_val), (X_test, y_test)]]
    return train_loader, val_loader, test_loader

# MCAR

In [8]:
# Time
import time

# Random Forest
import lightgbm as lgb
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Evaluation
metrics = ['time', 'mse', 'mae', 'r2', 'acc', 'prec', 'rec', 'f1', 'roc_auc']

# Experiment
seeds = [int(bin(i)[2:]) for i in list(range(5))]
results_lgbm_forest = np.zeros((len(datasets_MCAR), len(seeds), len(metrics)))

for did, mcar_dataset in enumerate(datasets_MCAR):
    for sid, seed in enumerate(seeds):
        # Verbose
        print(f'Dataset: {did+1}, Seed: {sid+1} STARTING TRAINING\n')

        # Prepare data
        X_train, X_val, X_test, y_train, y_val, y_test, dim, regression = prepareDataWithNans(mcar_dataset, seed)
        train_loader, val_loader, test_loader = data2Tensors(X_train, X_val, X_test, y_train, y_val, y_test)
        
        if regression:
            # Fit Random Forest
            start_time = time.time()
            model = lgb.LGBMRegressor(boosting_type='rf',
                          max_depth=4,
                          n_estimators=1000,
                          min_data_in_leaf=5,
                          feature_fraction=(dim**0.5)/dim,
                          random_state=seed,
                          verbosity=-1)
            model.fit(X_train, y_train)
            end_time = time.time()
            
            y_pred = model.predict(X_test)
            mse = mean_squared_error(y_test, y_pred)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            training_time = end_time - start_time
            results_lgbm_forest[did, sid] = [training_time, mse, mae, r2, np.nan, np.nan, np.nan, np.nan, np.nan]
            
        else:
            # Fit Random Forest
            start_time = time.time()
            model = lgb.LGBMClassifier(boosting_type='rf',
                          max_depth=4,
                          n_estimators=500,
                          min_data_in_leaf=5,
                          feature_fraction=(dim**0.5)/dim,
                          random_state=seed,
                          verbosity=-1)
            model.fit(X_train, y_train)
            end_time = time.time()
            
            y_pred = model.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            prec = precision_score(y_test, y_pred)
            rec = recall_score(y_test, y_pred)
            roc_auc = roc_auc_score(y_test, y_pred)
            training_time = end_time - start_time
            results_lgbm_forest[did, sid] = [training_time, np.nan, np.nan, np.nan, acc, prec, rec, f1, roc_auc]

        print(f'\nTraining results: {results_lgbm_forest[did, sid][3] if regression else results_lgbm_forest[did, sid][-1]:.4f} (RF)\n')
        np.save('../results/raw/results_lgbm_MCAR.npy', results_lgbm_forest)

Dataset: 1, Seed: 1 STARTING TRAINING


Training results: 0.4255 (RF)

Dataset: 1, Seed: 2 STARTING TRAINING


Training results: 0.2336 (RF)

Dataset: 1, Seed: 3 STARTING TRAINING


Training results: 0.2120 (RF)

Dataset: 1, Seed: 4 STARTING TRAINING


Training results: 0.4365 (RF)

Dataset: 1, Seed: 5 STARTING TRAINING


Training results: 0.2675 (RF)

Dataset: 2, Seed: 1 STARTING TRAINING


Training results: 0.7205 (RF)

Dataset: 2, Seed: 2 STARTING TRAINING


Training results: 0.6956 (RF)

Dataset: 2, Seed: 3 STARTING TRAINING


Training results: 0.7214 (RF)

Dataset: 2, Seed: 4 STARTING TRAINING


Training results: 0.7168 (RF)

Dataset: 2, Seed: 5 STARTING TRAINING


Training results: 0.7048 (RF)

Dataset: 3, Seed: 1 STARTING TRAINING


Training results: 0.7008 (RF)

Dataset: 3, Seed: 2 STARTING TRAINING


Training results: 0.6994 (RF)

Dataset: 3, Seed: 3 STARTING TRAINING


Training results: 0.6942 (RF)

Dataset: 3, Seed: 4 STARTING TRAINING


Training results: 0.6934 (RF)

Datase

# MNAR

In [12]:
# Time
import time

# Random Forest
import lightgbm as lgb
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Evaluation
metrics = ['time', 'mse', 'mae', 'r2', 'acc', 'prec', 'rec', 'f1', 'roc_auc']

# Experiment
seeds = [int(bin(i)[2:]) for i in list(range(5))]
results_lgbm_forest = np.zeros((len(datasets_MNAR), len(seeds), len(metrics)))

for did, mnar_dataset in enumerate(datasets_MNAR):
    for sid, seed in enumerate(seeds):
        # Verbose
        print(f'Dataset: {did+1}, Seed: {sid+1} STARTING TRAINING\n')

        # Prepare data
        X_train, X_val, X_test, y_train, y_val, y_test, dim, regression = prepareDataWithNans(mnar_dataset, seed)
        train_loader, val_loader, test_loader = data2Tensors(X_train, X_val, X_test, y_train, y_val, y_test)
        
        if regression:
            # Fit Random Forest
            start_time = time.time()
            model = lgb.LGBMRegressor(boosting_type='rf',
                          max_depth=4,
                          n_estimators=1000,
                          min_data_in_leaf=5,
                          feature_fraction=(dim**0.5)/dim,
                          random_state=seed,
                          verbosity=-1)
            model.fit(X_train, y_train)
            end_time = time.time()
            
            y_pred = model.predict(X_test)
            mse = mean_squared_error(y_test, y_pred)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            training_time = end_time - start_time
            results_lgbm_forest[did, sid] = [training_time, mse, mae, r2, np.nan, np.nan, np.nan, np.nan, np.nan]
            
        else:
            # Fit Random Forest
            start_time = time.time()
            model = lgb.LGBMClassifier(boosting_type='rf',
                          max_depth=4,
                          n_estimators=500,
                          min_data_in_leaf=5,
                          feature_fraction=(dim**0.5)/dim,
                          random_state=seed,
                          verbosity=-1)
            model.fit(X_train, y_train)
            end_time = time.time()
            
            y_pred = model.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            prec = precision_score(y_test, y_pred)
            rec = recall_score(y_test, y_pred)
            roc_auc = roc_auc_score(y_test, y_pred)
            training_time = end_time - start_time
            results_lgbm_forest[did, sid] = [training_time, np.nan, np.nan, np.nan, acc, prec, rec, f1, roc_auc]

        print(f'\nTraining results: {results_lgbm_forest[did, sid][3] if regression else results_lgbm_forest[did, sid][-1]:.4f} (RF)\n')
        np.save('../results/raw/results_lgbm_MNAR.npy', results_lgbm_forest)

Dataset: 1, Seed: 1 STARTING TRAINING


Training results: 0.3708 (RF)

Dataset: 1, Seed: 2 STARTING TRAINING


Training results: 0.2486 (RF)

Dataset: 1, Seed: 3 STARTING TRAINING


Training results: 0.2185 (RF)

Dataset: 1, Seed: 4 STARTING TRAINING


Training results: 0.3602 (RF)

Dataset: 1, Seed: 5 STARTING TRAINING


Training results: 0.2570 (RF)

Dataset: 2, Seed: 1 STARTING TRAINING


Training results: 0.7186 (RF)

Dataset: 2, Seed: 2 STARTING TRAINING


Training results: 0.6919 (RF)

Dataset: 2, Seed: 3 STARTING TRAINING


Training results: 0.7196 (RF)

Dataset: 2, Seed: 4 STARTING TRAINING


Training results: 0.7196 (RF)

Dataset: 2, Seed: 5 STARTING TRAINING


Training results: 0.7113 (RF)

Dataset: 3, Seed: 1 STARTING TRAINING


Training results: 0.7060 (RF)

Dataset: 3, Seed: 2 STARTING TRAINING


Training results: 0.7053 (RF)

Dataset: 3, Seed: 3 STARTING TRAINING


Training results: 0.6993 (RF)

Dataset: 3, Seed: 4 STARTING TRAINING


Training results: 0.7079 (RF)

Datase

# REAL

In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def prepareDataWithNans(dataset_name, SEED):
    data = pd.read_csv(f'../processed_data/yes_miss/{dataset_name}')
    y = data.y.values
    X = data.drop('y', axis=1).values
    n, dim = X.shape

    if len(data.y.unique()) == 2:
        regression = False
    else:
        regression = True

    # Split the data into training, validation, and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                        random_state=SEED, stratify=y if not regression else None)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, 
                                                      random_state=SEED, stratify=y_train if not regression else None)
       
    # Normalize and scale according to the training set
    eps = 1e-6
    mean = np.nanmean(X_train, axis=0)
    std = np.nanstd(X_train, axis=0) + eps
    X_train = (X_train - mean) / std
    X_val = (X_val - mean) / std
    X_test = (X_test - mean) / std
    
    return X_train, X_val, X_test, y_train, y_val, y_test, dim, regression

import torch
from torch.utils.data import DataLoader, TensorDataset

def data2Tensors(X_train, X_val, X_test, y_train, y_val, y_test):
    train_batch = int(2**np.ceil(np.log2(X_train.shape[0]//10)))
    X_train, X_val, X_test = [torch.tensor(x, dtype=torch.float32) for x in [X_train, X_val, X_test]]
    y_train, y_val, y_test = [torch.tensor(y, dtype=torch.float32) for y in [y_train, y_val, y_test]]
    
    train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=train_batch, shuffle=True)
    val_loader, test_loader = [DataLoader(TensorDataset(X, y), batch_size=X.shape[0], shuffle=False) for X, y in [(X_val, y_val), (X_test, y_test)]]
    return train_loader, val_loader, test_loader

#-----------------------------------------------

# Time
import time

# Random Forest
import lightgbm as lgb
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Evaluation
metrics = ['time', 'mse', 'mae', 'r2', 'acc', 'prec', 'rec', 'f1', 'roc_auc']

# Experiment
seeds = [int(bin(i)[2:]) for i in list(range(5))]
results_lgbm_forest = np.zeros((len(datasets), len(seeds), len(metrics)))

for did, dataset in enumerate(datasets):
    for sid, seed in enumerate(seeds):
        # Verbose
        print(f'Dataset: {did+1}, Seed: {sid+1} STARTING TRAINING\n')

        # Prepare data
        X_train, X_val, X_test, y_train, y_val, y_test, dim, regression = prepareDataWithNans(dataset, seed)
        train_loader, val_loader, test_loader = data2Tensors(X_train, X_val, X_test, y_train, y_val, y_test)
        
        if regression:
            # Fit Random Forest
            start_time = time.time()
            model = lgb.LGBMRegressor(boosting_type='rf',
                          max_depth=4,
                          n_estimators=1000,
                          min_data_in_leaf=5,
                          feature_fraction=(dim**0.5)/dim,
                          random_state=seed,
                          verbosity=-1)
            model.fit(X_train, y_train)
            end_time = time.time()
            
            y_pred = model.predict(X_test)
            mse = mean_squared_error(y_test, y_pred)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            training_time = end_time - start_time
            results_lgbm_forest[did, sid] = [training_time, mse, mae, r2, np.nan, np.nan, np.nan, np.nan, np.nan]
            
        else:
            # Fit Random Forest
            start_time = time.time()
            model = lgb.LGBMClassifier(boosting_type='rf',
                          max_depth=4,
                          n_estimators=500,
                          min_data_in_leaf=5,
                          feature_fraction=(dim**0.5)/dim,
                          random_state=seed,
                          verbosity=-1)
            model.fit(X_train, y_train)
            end_time = time.time()
            
            y_pred = model.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            prec = precision_score(y_test, y_pred)
            rec = recall_score(y_test, y_pred)
            roc_auc = roc_auc_score(y_test, y_pred)
            training_time = end_time - start_time
            results_lgbm_forest[did, sid] = [training_time, np.nan, np.nan, np.nan, acc, prec, rec, f1, roc_auc]

        print(f'\nTraining results: {results_lgbm_forest[did, sid][3] if regression else results_lgbm_forest[did, sid][-1]:.4f} (RF)\n')
        np.save('../results/raw/results_lgbm_real.npy', results_lgbm_forest)

Dataset: 1, Seed: 1 STARTING TRAINING


Training results: 0.7377 (RF)

Dataset: 1, Seed: 2 STARTING TRAINING


Training results: 0.6990 (RF)

Dataset: 1, Seed: 3 STARTING TRAINING


Training results: 0.7274 (RF)

Dataset: 1, Seed: 4 STARTING TRAINING


Training results: 0.7480 (RF)

Dataset: 1, Seed: 5 STARTING TRAINING


Training results: 0.7660 (RF)

Dataset: 2, Seed: 1 STARTING TRAINING


Training results: 0.5767 (RF)

Dataset: 2, Seed: 2 STARTING TRAINING


Training results: 0.5793 (RF)

Dataset: 2, Seed: 3 STARTING TRAINING


Training results: 0.5972 (RF)

Dataset: 2, Seed: 4 STARTING TRAINING


Training results: 0.5735 (RF)

Dataset: 2, Seed: 5 STARTING TRAINING


Training results: 0.5798 (RF)

Dataset: 3, Seed: 1 STARTING TRAINING


Training results: 0.7203 (RF)

Dataset: 3, Seed: 2 STARTING TRAINING


Training results: 0.7196 (RF)

Dataset: 3, Seed: 3 STARTING TRAINING


Training results: 0.7310 (RF)

Dataset: 3, Seed: 4 STARTING TRAINING


Training results: 0.7380 (RF)

Datase