In [69]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict

def return_split_con_cat_column(df_miss):
    """Partition the column names of ``df_miss`` by their naming prefix.

    The text before the first underscore of each column name is inspected;
    if its first three characters are ``'con'`` the column goes to the first
    list, if they are ``'cat'`` it goes to the second list. Columns matching
    neither prefix are left out of both lists.

    Returns:
        (con, cat): two lists of column names, each in original column order.
    """
    con = []
    cat = []
    for name in df_miss.columns:
        # Only the leading token (text before the first '_') decides the kind.
        prefix = name.split('_', 1)[0][:3]
        if prefix == 'con':
            con.append(name)
        elif prefix == 'cat':
            cat.append(name)

    return con, cat

def return_dictionary_column(column_unique):
    """Build forward and reverse lookup tables for a sequence of values.

    Returns:
        (dict_res, dict_rev): ``dict_res`` maps each value to its position in
        ``column_unique`` (the last position wins if a value repeats) and
        ``dict_rev`` maps each position back to the value found there.
    """
    values = list(column_unique)
    dict_res = {value: position for position, value in enumerate(values)}
    dict_rev = dict(enumerate(values))

    return dict_res, dict_rev

def return_encode_column(column_m, dictionary, mode):
    """Encode a categorical column as one row vector per entry.

    Args:
        column_m: sequence of category values; an entry whose ``str()`` is
            ``'nan'`` is treated as missing.
        dictionary: mapping from category value to its column index.
        mode: ``'one_hot'`` produces 0/1 indicator rows; ``'embedding'``
            produces the same indicators rescaled to -1/+1.

    Returns:
        float32 array of shape ``(len(column_m), len(dictionary))``. Rows for
        missing entries are filled with NaN.

    Raises:
        ValueError: if ``mode`` is not one of the two supported values
            (previously this surfaced as an ``UnboundLocalError``).
        KeyError: if a non-missing entry is absent from ``dictionary``.
    """
    if mode not in ('one_hot', 'embedding'):
        raise ValueError("mode must be 'one_hot' or 'embedding', got {!r}".format(mode))

    column_m = list(column_m)
    width = len(dictionary)

    # Build the lookup table of code vectors once instead of once per row.
    codes = np.eye(width, dtype = np.float32)
    if mode == 'embedding':
        codes = codes * 2. - 1.

    # Pre-filling with NaN keeps the result 2-D even for an empty column, so
    # callers can always read ``.shape[1]``.
    array_encode = np.full((len(column_m), width), np.nan, dtype = np.float32)
    for row, value in enumerate(column_m):
        # ``value == np.nan`` is always False, so the string form is the
        # only reliable missing-value test used here.
        if str(value) == 'nan':
            continue
        array_encode[row] = codes[dictionary[value]]

    return array_encode
    
    
def return_normalized_column(column_m, column_i, mode):
    """Min-max normalize a continuous column that may contain NaN.

    Args:
        column_m: sequence of numeric values, NaN marking missing entries.
        column_i: sequence of numeric values for the same column in the
            complete data; only its max/min are computed and returned.
        mode: ``'one_hot'`` scales to [0, 1]; ``'embedding'`` scales to
            [-1, 1].

    Returns:
        (array_m_normalized, max_m, min_m, max_i, min_i) where
        ``array_m_normalized`` is a float32 array of shape ``(n, 1)`` and
        ``max_m``/``min_m`` ignore NaN. NaN entries stay NaN in the output.

    Raises:
        ValueError: if ``mode`` is not one of the two supported values
            (previously this surfaced as an ``UnboundLocalError``).
    """
    if mode not in ('one_hot', 'embedding'):
        raise ValueError("mode must be 'one_hot' or 'embedding', got {!r}".format(mode))

    max_m = np.nanmax(np.array(column_m))
    min_m = np.nanmin(np.array(column_m))
    max_i = np.max(np.array(column_i))
    min_i = np.min(np.array(column_i))

    values = np.asarray(column_m, dtype = np.float64)
    if min_m != max_m:
        scaled = (values - min_m)/(max_m - min_m)
        if mode == 'embedding':
            scaled = scaled * 2. - 1.
    else:
        # Constant column: every observed entry maps to 0. Multiplying by
        # zero (rather than writing a literal 0.) keeps NaN entries as NaN in
        # BOTH modes; the old 'embedding' branch overwrote them with 0.,
        # which made missing cells look observed.
        scaled = values * 0.

    array_m_normalized = scaled.astype(np.float32).reshape(len(scaled), 1)

    return array_m_normalized, max_m, min_m, max_i, min_i

def return_data_miss_and_full_train(miss_method, index_miss, index_file, mode = 'one_hot'):
    """Load one incomplete CSV plus the complete CSV and encode the incomplete one.

    Columns of the complete frame are visited in order. Those that
    ``return_split_con_cat_column`` classifies (from the incomplete frame) as
    categorical are encoded with ``return_encode_column``; continuous ones are
    normalized with ``return_normalized_column``. Columns that are neither are
    skipped. Previously such a column re-appended the preceding column's
    encoding and corrupted ``locations`` (or raised ``NameError`` when it came
    first).

    Args:
        miss_method: directory name under ``data_stored/data_miss``.
        index_miss: value substituted into the ``miss{}`` path component.
        index_file: base name (without ``.csv``) of the incomplete file.
        mode: ``'one_hot'`` or ``'embedding'``, forwarded to the encoders.

    Returns:
        An 8-tuple ``(array_result, column_names, locations, labels, df_full,
        df_miss, attach, labels_ori)``:
          * ``array_result``: float32 array, encoded columns side by side.
          * ``column_names``: the columns of ``df_full`` that were encoded
            (all of them when every column is con/cat).
          * ``locations``: cumulative end offset of each encoded column in
            ``array_result``.
          * ``labels`` / ``labels_ori``: per column ``['cat', [forward, reverse]]``
            or ``['con', [max, min]]`` from the incomplete / complete data.
          * ``attach``: per column ``['cat', unique complete values]`` or
            ``['con', [max_m, min_m]]``.

    Raises:
        ValueError: if no column is recognised as con or cat.
    """
    df_miss = pd.read_csv('data_stored/data_miss/{}/Case14/miss{}/{}.csv'.format(miss_method, index_miss, index_file))
    df_full = pd.read_csv('data_stored/data/14.csv')
    con, cat = return_split_con_cat_column(df_miss = df_miss)
    con_names = set(con)
    cat_names = set(cat)

    labels = []
    labels_ori = []
    locations = []
    attach = []
    encoded_blocks = []   # one 2-D float32 array per encoded column
    keep = []             # True for every df_full column that was encoded

    for name in df_full.columns:
        if name in cat_names:
            column_m = df_miss[name].to_list()
            column_i = df_full[name].to_list()
            columns_m_unique = list(set([v for v in column_m if str(v) != 'nan']))
            columns_i_unique = list(set(column_i))
            dict_res_column, dict_rev_column = return_dictionary_column(column_unique = columns_m_unique)
            dictori_res_column, dictori_rev_column = return_dictionary_column(column_unique = columns_i_unique)
            column_encode = return_encode_column(column_m = column_m, dictionary = dict_res_column, mode = mode)
            labels.append(['cat', [dict_res_column, dict_rev_column]])
            labels_ori.append(['cat', [dictori_res_column, dictori_rev_column]])
            attach.append(['cat', columns_i_unique])
        elif name in con_names:
            column_m = df_miss[name].to_list()
            column_i = df_full[name].to_list()
            column_encode, max_m, min_m, max_i, min_i = return_normalized_column(column_m = column_m, column_i = column_i, mode = mode)
            labels.append(['con', [max_m, min_m]])
            labels_ori.append(['con', [max_i, min_i]])
            attach.append(['con', [max_m, min_m]])
        else:
            # Not a feature column: contribute nothing to the encoded matrix.
            keep.append(False)
            continue

        keep.append(True)
        encoded_blocks.append(np.array(column_encode, np.float32))
        # Store the running end offset so slices can be taken as
        # [locations[k-1]:locations[k]].
        previous_end = locations[-1] if locations else 0
        locations.append(previous_end + column_encode.shape[1])

    if not encoded_blocks:
        raise ValueError("no column with a 'con'/'cat' prefix was found to encode")

    # Single concatenation instead of growing the array column by column.
    array_result = np.concatenate(encoded_blocks, axis = 1)
    column_names = df_full.columns[np.array(keep, dtype = bool)]
    return array_result, column_names, locations, labels, df_full, df_miss, attach, labels_ori


In [70]:
import os
import numpy as np
import pandas as pd

def return_encode_column_R(column_m, dictionary):
    """One-hot encode a categorical column.

    Each entry is looked up in ``dictionary`` (value -> column index) and
    turned into an indicator row of length ``len(dictionary)``. An entry whose
    string form is ``"nan"`` produces a row of NaN and prints a notice.

    Returns:
        float32 array built from the per-entry rows.
    """
    width = len(dictionary)
    identity = np.eye(width)
    rows = []
    for value in column_m:
        is_missing = value == np.nan or str(value) == "nan"
        if not is_missing:
            rows.append(list(identity[dictionary[value]]))
            continue
        rows.append([np.nan] * width)
        print("...there is nan in imputed data...")

    return np.array(rows, dtype = np.float32)
    
    
def return_normalized_column_R(column_m):
    """Min-max scale a numeric column to [0, 1].

    Returns:
        (array_m_normalized, max_m, min_m): a float32 array of shape
        ``(len(column_m), 1)`` together with the column maximum and minimum.
        When the maximum equals the minimum every entry is multiplied by zero
        instead of being divided by a zero range.
    """
    values = np.array(column_m)
    max_m = np.max(values)
    min_m = np.min(values)

    if max_m == min_m:
        scaled = [entry * 0. for entry in column_m]
    else:
        span = max_m - min_m
        scaled = [(entry - min_m)/span for entry in column_m]

    array_m_normalized = np.array(scaled, np.float32).reshape(len(scaled), 1)

    return array_m_normalized, max_m, min_m

def return_data_miss_and_full_R(miss_method, index_miss, index_file):
    """Load one imputed CSV and encode it column by column.

    Categorical columns (per ``return_split_con_cat_column``) are one-hot
    encoded with ``return_encode_column_R``; continuous columns are min-max
    normalized with ``return_normalized_column_R``. Columns that are neither
    are skipped. Previously such a column re-appended the preceding column's
    encoding and corrupted ``locations`` (or raised ``NameError`` when it came
    first).

    Args:
        miss_method: directory name under ``data_stored/data_dsan``.
        index_miss: value substituted into the ``miss{}`` path component.
        index_file: base name (without ``.csv``) of the imputed file.

    Returns:
        A 5-tuple ``(array_result, column_names, locations, labels, df_miss)``:
          * ``array_result``: float32 array, encoded columns side by side.
          * ``column_names``: the columns of the loaded frame that were
            encoded (all of them when every column is con/cat).
          * ``locations``: cumulative end offset of each encoded column.
          * ``labels``: per column ``["cat", [forward, reverse]]`` or
            ``["con", [max, min]]``.
          * ``df_miss``: the loaded frame.

    Raises:
        ValueError: if no column is recognised as con or cat.
    """
    # full imputed data
    df_miss = pd.read_csv('data_stored/data_dsan/{}/Case14/miss{}/{}.csv'.format(miss_method, index_miss, index_file))
    con, cat = return_split_con_cat_column(df_miss = df_miss)
    con_names = set(con)
    cat_names = set(cat)

    labels = []
    locations = []
    encoded_blocks = []   # one 2-D float32 array per encoded column
    keep = []             # True for every column that was encoded

    for name in df_miss.columns:
        if name in cat_names:
            column_m = df_miss[name].to_list()
            columns_m_unique = list(set([v for v in column_m if str(v) != "nan"]))
            dict_res_column, dict_rev_column = return_dictionary_column(column_unique = columns_m_unique)
            column_encode = return_encode_column_R(column_m = column_m, dictionary = dict_res_column)
            labels.append(["cat", [dict_res_column, dict_rev_column]])
        elif name in con_names:
            column_m = df_miss[name].to_list()
            column_encode, max_m, min_m = return_normalized_column_R(column_m = column_m)
            labels.append(["con", [max_m, min_m]])
        else:
            # Not a feature column: contribute nothing to the encoded matrix.
            keep.append(False)
            continue

        keep.append(True)
        encoded_blocks.append(np.array(column_encode, np.float32))
        # Store the running end offset so slices can be taken as
        # [locations[k-1]:locations[k]].
        previous_end = locations[-1] if locations else 0
        locations.append(previous_end + column_encode.shape[1])

    if not encoded_blocks:
        raise ValueError("no column with a 'con'/'cat' prefix was found to encode")

    # Single concatenation instead of growing the array column by column.
    array_result = np.concatenate(encoded_blocks, axis = 1)
    column_names = df_miss.columns[np.array(keep, dtype = bool)]
    return array_result, column_names, locations, labels, df_miss

In [71]:
def return_data_miss_and_full(miss_method, index_miss, index_file, mode = 'one_hot'):
    """Forward all arguments to ``return_data_miss_and_full_train`` and return its result unchanged."""
    loader_kwargs = {
        'miss_method': miss_method,
        'index_miss': index_miss,
        'index_file': index_file,
        'mode': mode,
    }
    return return_data_miss_and_full_train(**loader_kwargs)

In [72]:
import numpy as np
import pandas as pd

class Model_test():
    """Score generated (imputed) data against the original data, column by column.

    Continuous columns contribute a squared error and categorical columns a
    count of correct predictions. In both cases only entries whose mask value
    is 0 carry weight (the weight used is ``1 - mask``).
    """

    def __init__(self, label_reverse, label_ori, column_location, column_name, mode = 'one_hot'):
        """Store the per-column metadata.

        Args:
            label_reverse: per column ``[kind, payload]`` for the generated
                data; ``payload`` is ``[max, min]`` for ``'con'`` and
                ``[forward_dict, reverse_dict]`` otherwise.
            label_ori: same layout; only the ``[max, min]`` of ``'con'``
                columns is read here.
            column_location: cumulative end offset of each column's slice in
                the encoded matrix.
            column_name: column names, aligned with the lists above.
            mode: ``'embedding'`` means continuous values arrive in [-1, 1]
                and are first mapped back to [0, 1].
        """
        self.mode = mode
        self.label_reverse = label_reverse
        self.label_ori = label_ori
        self.column_location = column_location
        self.column_name = column_name

    def return_accuary_for_con_cat(self, index, generate_i, mask_i, list_original_label):
        """Score one column.

        Args:
            index: position of the column in the metadata lists.
            generate_i: generated slice for this column (2-D array).
            mask_i: mask slice of the same shape; 0 marks entries to score.
            list_original_label: original values of the column.

        Returns:
            ``['con', weighted squared-error sum, number of scored entries]``
            or ``['cat', number of correct scored rows, number of scored rows]``.
        """
        if self.label_reverse[index][0] == 'con':
            max_ = self.label_reverse[index][1][0]
            min_ = self.label_reverse[index][1][1]
            max_ori = self.label_ori[index][1][0]
            min_ori = self.label_ori[index][1][1]
            if self.mode == 'embedding':
                generate_i = (generate_i + 1.)/2.
            # Undo the normalization of the generated data ...
            generate_i_re = generate_i * (max_ - min_) + min_
            label_i_ori = np.array(list_original_label, np.float32).reshape(len(list_original_label), 1)
            # ... then express both sides on the original column's scale.
            if max_ori != min_ori:
                scale_ori = max_ori - min_ori
                generate_i_ori = (generate_i_re - min_ori)/scale_ori
                label_i_ori = (label_i_ori - min_ori)/scale_ori
            else:
                # Zero range: the generated side must get the same treatment
                # as the label. Dividing it by zero (as before) produced NaN,
                # and NaN * 0 is still NaN, so the whole sum became NaN.
                generate_i_ori = (generate_i_re - min_ori) * 0.
                label_i_ori = (label_i_ori - min_ori) * 0.

            weight = 1. - mask_i
            return ['con', np.sum(((generate_i_ori - label_i_ori)**2) * weight), np.sum(weight)]

        else:
            # Reverse dictionary: column index -> category value.
            dictionary = self.label_reverse[index][1][1]
            generate_i_argmax = list(np.argmax(generate_i, axis = 1))
            # Only the first column of the mask slice is read for the row.
            mask_i_argmax = mask_i[:, 0]
            label_i_argmax = list_original_label
            result = np.array([dictionary[generate_i_argmax[i]] == label_i_argmax[i] for i in range(len(label_i_argmax))], dtype = np.float32)

            weight = 1. - mask_i_argmax
            return ['cat', np.sum(result * weight), np.sum(weight)]

    def return_con_loss_cat_accuary_test_result(self, result):
        """Aggregate per-column scores.

        Returns:
            ``(rmse, accuracy)`` as Python floats: the square root of the
            summed squared error over the number of scored continuous entries,
            and the correct count over the number of scored categorical rows.
            A zero denominator is replaced by 1.
        """
        con_loss = 0
        con_mask_sum = 0
        cat_accuary = 0
        cat_mask_sum = 0

        for kind, score, count in result:
            if kind == 'con':
                con_loss = con_loss + score
                con_mask_sum = con_mask_sum + count
            else:
                cat_accuary = cat_accuary + score
                cat_mask_sum = cat_mask_sum + count

        # Avoid 0/0 when there was nothing to score for one of the kinds.
        if con_mask_sum == 0:
            con_mask_sum = con_mask_sum + 1
        if cat_mask_sum == 0:
            cat_mask_sum = cat_mask_sum + 1

        return float(np.sqrt(con_loss/con_mask_sum)), float(cat_accuary/cat_mask_sum)

    def cross_validation_result(self, data_list, mask_list, df_full_list, df_miss_list):
        """Stack per-fold arrays row-wise and concatenate the per-fold frames.

        Returns:
            ``(data, mask, df_full, df_miss)``.
        """
        # One mask per data array, paired by position; concatenate once.
        masks = [mask_list[position] for position in range(len(data_list))]
        data = np.concatenate(list(data_list), axis = 0)
        mask = np.concatenate(masks, axis = 0)

        df_full = pd.concat(df_full_list)
        df_miss = pd.concat(df_miss_list)
        return data, mask, df_full, df_miss

    def model_test(self, data = None, mask = None, df_original = None):
        """Score every column and return ``(con_loss, cat_accuracy)``.

        Args:
            data: encoded generated matrix.
            mask: matrix of the same shape; 0 marks entries to score.
            df_original: frame holding the original values, indexed by the
                names in ``self.column_name``.
        """
        result = []

        for index, name in enumerate(self.column_name):
            # Column k occupies [column_location[k-1], column_location[k]).
            start = 0 if index == 0 else self.column_location[index - 1]
            stop = self.column_location[index]
            generate_i = data[:, start:stop]
            mask_i = mask[:, start:stop]
            list_original_label = df_original[name].to_list()
            re = self.return_accuary_for_con_cat(index = index, generate_i = generate_i, mask_i = mask_i, list_original_label = list_original_label)
            result.append(re)

        con_loss, cat_accuray = self.return_con_loss_cat_accuary_test_result(result = result)
        return con_loss, cat_accuray


In [73]:
import os
import sys
import numpy as np
import pandas as pd

def returnMedianrange(con_loss_last_array, cat_accuracy_last_array, num_of_test, miss_method, index_miss, name):
    """Summarise per-run scores and write them to a one-row CSV.

    NaN entries are dropped from each list independently. For each list the
    median, the difference between the medians of its upper and lower halves,
    the mean (rounded to 4 decimals) and the standard deviation (rounded to 4
    decimals, stored under the ``var*`` column names) are written to
    ``<cwd>/performance/<miss_method>/Case14/miss<index_miss>/<name>_summary.csv``.
    Missing directories are created. Empty lists yield NaN statistics.

    Args:
        con_loss_last_array: per-run continuous losses.
        cat_accuracy_last_array: per-run categorical accuracies.
        num_of_test: accepted for backward compatibility; the split points
            are derived from the filtered list lengths instead.
        miss_method: directory component below ``performance``.
        index_miss: value substituted into the ``miss{}`` directory name.
        name: file-name prefix of the summary CSV.
    """
    # str() comparison mirrors how missing values are detected elsewhere in
    # this notebook; sorted() also avoids mutating the caller's lists.
    con_sorted = sorted(i for i in con_loss_last_array if str(i) != 'nan')
    cat_sorted = sorted(i for i in cat_accuracy_last_array if str(i) != 'nan')

    miss_path = os.path.join(os.getcwd(), 'performance', miss_method, 'Case14', 'miss{0}'.format(index_miss))
    os.makedirs(miss_path, exist_ok = True)
    perform_summary_path = os.path.join(miss_path, name + '_summary.csv')

    # Each list is split at ITS OWN midpoint. The previous code reused the
    # continuous list's length for the categorical list, which gave wrong
    # quartiles whenever NaN filtering left the two lists with unequal sizes.
    half_con = len(con_sorted) // 2
    half_cat = len(cat_sorted) // 2

    medCon = np.median(con_sorted)
    medCat = np.median(cat_sorted)
    q1Con = np.median(con_sorted[:half_con])
    q2Con = np.median(con_sorted[half_con:])
    q1Cat = np.median(cat_sorted[:half_cat])
    q2Cat = np.median(cat_sorted[half_cat:])

    meanCon = np.mean(con_sorted)
    varCon = np.std(con_sorted)
    meanCat = np.mean(cat_sorted)
    varCat = np.std(cat_sorted)

    df_rem = pd.DataFrame({
        'medCon': [medCon],
        'qRCon': [q2Con - q1Con],
        'medCat': [medCat],
        'qRCat': [q2Cat - q1Cat],
        'meanCon': [np.around(meanCon, 4)],
        'varCon': [np.around(varCon, 4)],
        'meanCat': [np.around(meanCat, 4)],
        'varCat': [np.around(varCat, 4)],
    })
    df_rem.to_csv(perform_summary_path, index = False)


In [74]:
def return_mask_of_data(data):
    """Return a float mask with 1.0 where ``data`` is not NaN and 0.0 where it is NaN."""
    observed = np.logical_not(np.isnan(data))
    return observed.astype(np.float64)
  
def Rimputed_test(miss_method, index_miss, imputed_method = 'MissForest_py', num_of_test = 100):
    """Evaluate imputed files ``0 .. num_of_test-1`` and write a summary CSV.

    For each file index the incomplete data is loaded to derive the mask
    (NaN positions), the imputed data is loaded and encoded, and
    ``Model_test.model_test`` scores the imputed data against the complete
    frame. The per-file scores are then passed to ``returnMedianrange``.

    A file that fails to load or score is skipped, but the failure is now
    reported instead of being silently discarded.

    Args:
        miss_method: directory name used by the loaders and the summary path.
        index_miss: value substituted into the ``miss{}`` path component.
        imputed_method: file-name prefix of the summary CSV.
        num_of_test: number of file indices to evaluate.
    """
    # Local import: this function calls gc.collect(), but the notebook only
    # imports gc in a later cell.
    import gc

    con_res = []
    cat_res = []
    for index in tqdm(range(num_of_test)):
        try:
            data, _, _, _, df_full, _, _, labels_ori = return_data_miss_and_full(miss_method = miss_method, index_miss = index_miss, index_file = index, mode = 'one_hot')

            mask = return_mask_of_data(data = data)
            data_imputed, column_name, column_location, labels, _ = return_data_miss_and_full_R(miss_method = miss_method, index_miss = index_miss, index_file = index)

            mt = Model_test(label_reverse = labels, label_ori = labels_ori, column_location = column_location, column_name = column_name)
            con_loss, cat_acc = mt.model_test(data = data_imputed, mask = mask, df_original = df_full)
            con_res.append(con_loss)
            cat_res.append(cat_acc)
        except Exception as error:
            # Best-effort: keep going, but say which file was dropped and
            # why. Catching Exception (not a bare except) lets
            # KeyboardInterrupt/SystemExit stop the run. tqdm.write prints
            # without breaking the progress bar.
            tqdm.write("skipping {}/miss{}/{}: {}: {}".format(miss_method, index_miss, index, type(error).__name__, error))
            continue

    returnMedianrange(con_loss_last_array = con_res, cat_accuracy_last_array = cat_res, num_of_test = num_of_test, miss_method = miss_method, index_miss = index_miss, name = imputed_method)
    gc.collect()


In [75]:
import numpy as np
import pandas as pd
import gc

# Driver: run the evaluation for every (miss_method, index_miss) combination.
# Each call writes one summary CSV via Rimputed_test -> returnMedianrange.

# Values substituted into the `miss{}` path component of the input/output paths.
index_miss_list = [10, 30, 50]
# Number of file indices (0 .. num_of_test-1) evaluated per combination.
num_of_test = 10
# Directory names used when locating input files and writing summaries.
miss_methods = ["MCAR","MAR",'MNAR']
for miss_method in miss_methods:
    # Inner progress bar: one tick per index_miss value.
    for index_miss in tqdm(index_miss_list):  
        Rimputed_test(miss_method = miss_method, index_miss = index_miss, imputed_method = 'DSAN', num_of_test = num_of_test)
# The value returned by this final call is what the cell displays as its output.
gc.collect()

  0%|                                                         | 0/3 [00:00<?, ?it/s]
  0%|                                                        | 0/10 [00:00<?, ?it/s][A
 10%|████▊                                           | 1/10 [00:00<00:03,  2.62it/s][A
 20%|█████████▌                                      | 2/10 [00:00<00:02,  3.07it/s][A
 30%|██████████████▍                                 | 3/10 [00:00<00:02,  3.13it/s][A
 40%|███████████████████▏                            | 4/10 [00:01<00:01,  3.04it/s][A
 50%|████████████████████████                        | 5/10 [00:01<00:01,  3.09it/s][A
 60%|████████████████████████████▊                   | 6/10 [00:01<00:01,  3.13it/s][A
 70%|█████████████████████████████████▌              | 7/10 [00:02<00:00,  3.16it/s][A

...there is nan in imputed data...



 80%|██████████████████████████████████████▍         | 8/10 [00:02<00:00,  3.17it/s][A
 90%|███████████████████████████████████████████▏    | 9/10 [00:02<00:00,  3.18it/s][A
100%|███████████████████████████████████████████████| 10/10 [00:03<00:00,  3.12it/s][A
 33%|████████████████▎                                | 1/3 [00:03<00:06,  3.25s/it]

...there is nan in imputed data...



  0%|                                                        | 0/10 [00:00<?, ?it/s][A
 10%|████▊                                           | 1/10 [00:00<00:02,  3.63it/s][A
 20%|█████████▌                                      | 2/10 [00:00<00:02,  3.46it/s][A
 30%|██████████████▍                                 | 3/10 [00:00<00:02,  3.40it/s][A
 40%|███████████████████▏                            | 4/10 [00:01<00:01,  3.31it/s][A
 50%|████████████████████████                        | 5/10 [00:01<00:01,  3.33it/s][A
 60%|████████████████████████████▊                   | 6/10 [00:01<00:01,  3.32it/s][A
 70%|█████████████████████████████████▌              | 7/10 [00:02<00:00,  3.33it/s][A
 80%|██████████████████████████████████████▍         | 8/10 [00:02<00:00,  3.34it/s][A
 90%|███████████████████████████████████████████▏    | 9/10 [00:02<00:00,  3.36it/s][A
100%|███████████████████████████████████████████████| 10/10 [00:02<00:00,  3.36it/s][A
 67%|██████████████████████████

 70%|█████████████████████████████████▌              | 7/10 [00:01<00:00,  3.50it/s][A
 80%|██████████████████████████████████████▍         | 8/10 [00:02<00:00,  3.49it/s][A
 90%|███████████████████████████████████████████▏    | 9/10 [00:02<00:00,  3.52it/s][A
100%|███████████████████████████████████████████████| 10/10 [00:02<00:00,  3.52it/s][A
100%|█████████████████████████████████████████████████| 3/3 [00:09<00:00,  3.00s/it]


0