In [1]:
import tensorflow as tf

from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Sequential
import tensorflow.keras.optimizers as Optimizer
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
import keras_tuner as kt

import numpy as np
import pandas as pd

from AutoEncoder import DAE, VAE, CAE, DuoLossAE
from TargetEncoder import KFoldTargetEncoderTrain
import HandleData

import json

import sys
import sklearn.neighbors._base
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, IterativeImputer
from missingpy import MissForest
import os

import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use("ggplot")
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Environment check: list the GPUs TensorFlow can see, then report whether
# this TensorFlow build was compiled with CUDA support.
print(tf.config.list_physical_devices(device_type="GPU"))
print(tf.test.is_built_with_cuda())

In [None]:
# Read the data: load every CSV file found in the "Data" directory.
# Files are visited in sorted name order so that the position of a dataset in
# list_data (later stored as the "Data" index of each result) does not depend
# on the arbitrary ordering returned by os.listdir.
list_data = []
for url_data in sorted(os.listdir("Data")):
    # Match on the extension rather than on a substring, so that names which
    # merely contain "csv" are not parsed as CSV files.
    if url_data.endswith(".csv"):
        dt = pd.read_csv(os.path.join("Data", url_data))
        # Force every column name to be a string.
        dt = dt.rename(columns={each: str(each) for each in dt.columns.to_list()})
        list_data.append(dt)

In [None]:
def create_missing_label_data(df, label, thre = 0.2, mechanism = "mcar", method = "uniform", norm_method=True):
    """Create an incomplete version of ``df`` and optionally normalise/encode it.

    The input frame is copied first, so the caller's data is never modified.
    The previous implementation wrote the encoded label back into the frame it
    received, and that same frame is reused for every missing-rate/mechanism
    combination.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset containing the ``label`` column.
    label : str
        Name of the target column; its values are integer-encoded with
        ``LabelEncoder``.
    thre, mechanism, method
        Forwarded unchanged to ``HandleData.missing_method``. Callers in this
        notebook use ``thre`` as the missing rate (0.2 for 20 %); the exact
        semantics are defined by that helper.
    norm_method : bool, default True
        Only the literal ``True`` enables the normalisation/encoding step.

    Returns
    -------
    tuple
        ``(df, mask, cat_cols)``. With ``norm_method=True`` ``df`` is a new
        frame whose columns are the label, the standardised numeric columns,
        the target-encoded categorical columns (``<name>_tar``) and the binary
        columns, in that order; otherwise it is the frame returned by
        ``HandleData.missing_method``. ``mask`` is whatever that helper
        returns. ``cat_cols`` is the list of ``<name>_tar`` columns, or
        ``None`` when there are no categorical columns or no encoding was done.
    """
    # Work on a private copy so repeated calls always start from the same data.
    df = df.copy()
    df[label] = LabelEncoder().fit_transform(df[label])
    label_df = df[label].values

    # Column roles are decided on the complete data, before values are removed.
    numeric_like = [c for c in df.select_dtypes(include=['int64', 'float64']).columns if c != label]
    cat_cols = [i for i in df.select_dtypes(include='object').columns if i != label]
    num_cols = [i for i in numeric_like if df[i].nunique() != 2]
    binary_cols = [i for i in numeric_like if df[i].nunique() == 2]

    # Inject the missing values.
    df, mask = HandleData.missing_method(df, mechanism, method, thre)

    # The output matrix starts with the label column taken before the
    # missing-value injection.
    data, new_col = label_df.reshape(len(label_df), 1), [label]
    col_target = None
    if norm_method is True:
        if len(num_cols) != 0:
            # Standardise the non-binary numeric features.
            std = StandardScaler().fit(df[num_cols])
            data = np.hstack((data, std.transform(df[num_cols])))
            new_col = [*new_col, *num_cols]
        if len(cat_cols) != 0:
            # Target-encode each categorical feature; the encoder is expected
            # to add a "<name>_tar" column to the frame.
            for each in cat_cols:
                targetc = KFoldTargetEncoderTrain(each, label, n_fold=10, verbosity=False)
                df = targetc.fit_transform(df)
            col_target = [each + "_tar" for each in cat_cols]
            data = np.hstack((data, df[col_target]))
            new_col = [*new_col, *col_target]
        if len(binary_cols) != 0:
            # Binary features are passed through unchanged.
            data = np.hstack((data, df[binary_cols]))
            new_col = [*new_col, *binary_cols]
        df = pd.DataFrame(data=data, columns=new_col)

    return df, mask, col_target

In [None]:
# Create the incomplete datasets; categorical values are encoded with target
# encoding. For every dataset one frame is produced per missing rate
# (20 %, 40 %, 60 %) and per (mechanism, method) combination.
list_20per_miss = []
list_40per_miss = []
list_60per_miss = []
for each in list_data:
    for thre, bucket in ((0.2, list_20per_miss), (0.4, list_40per_miss), (0.6, list_60per_miss)):
        obj = {}
        cat_cols = None
        for key, mechanism, method in (
            ("mcar_uni", "mcar", "uniform"),
            ("mcar_ran", "mcar", "random"),
            ("mnar_uni", "mnar", "uniform"),
            ("mnar_ran", "mnar", "random"),
        ):
            obj[key], _, cat_cols = create_missing_label_data(
                each, 'label', thre=thre, mechanism=mechanism, method=method
            )
        # "cat_cols" is stored last; the experiment loops skip this key when
        # iterating over the generated frames.
        obj["cat_cols"] = cat_cols
        bucket.append(obj)

In [None]:
def model_for_classification(num_class, learning_rate=0.005):
    """Build and compile the small MLP used to score every imputation method.

    Parameters
    ----------
    num_class : int
        Number of output units of the final softmax layer.
    learning_rate : float, default 0.005
        Learning rate given to the Adam optimizer.

    Returns
    -------
    A compiled ``Sequential`` model trained with sparse categorical
    cross-entropy, i.e. it expects integer class labels.
    """
    model = Sequential([
        Dense(units=128, activation="relu", kernel_initializer="he_normal"),
        BatchNormalization(),
        Dropout(rate=0.2),
        Dense(units=64, activation="relu", kernel_initializer="he_normal"),
        BatchNormalization(),
        Dropout(rate=0.1),
        Dense(num_class, activation="softmax")
    ])
    adam = Optimizer.Adam(learning_rate=learning_rate)
    # `metrics` is documented as a list of metrics; a bare string is rejected
    # by newer Keras releases.
    model.compile(optimizer=adam, loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    return model

In [None]:
def result_each_method(model, X_, y_):
    """Train ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model
        Compiled Keras classifier (see ``model_for_classification``).
    X_ : sequence
        ``X_[0]`` holds the training features, ``X_[1]`` the test features.
    y_ : sequence
        ``y_[0]`` holds the training labels, ``y_[1]`` the test labels.

    Returns
    -------
    tuple
        ``(accuracy, macro_f1)`` computed on the test split.
    """
    # No validation data is supplied, so early stopping follows the training loss.
    earlyStop = EarlyStopping(monitor="loss", patience=8, mode="min")
    # verbose=0 is the documented "silent" setting (the original -1 is not a
    # documented value).
    model.fit(X_[0], y_[0], epochs=80, batch_size=128, verbose=0, callbacks=[earlyStop])
    # Turn the softmax outputs into class indices.
    pred = np.argmax(model.predict(X_[1]), axis=1)
    return accuracy_score(y_[1], pred), f1_score(y_[1], pred, average="macro")

# Thử nghiệm với KNN, MissForest, MICE

In [None]:
def fill_missing_split_norm_method(df, col_cate, label='label'):
    """Split the data and impute it with KNN, MissForest and MICE.

    Every imputer is fitted on the training split only and then applied to
    both splits.

    Parameters
    ----------
    df : pandas.DataFrame
        Incomplete dataset that still contains the ``label`` column.
    col_cate : list of str or None
        Columns cast to ``category`` dtype before MissForest is fitted;
        ``None`` skips the cast.
    label : str, default 'label'
        Name of the target column.

    Returns
    -------
    tuple
        ``(knn, miss_forest, mice, labels)`` where each of the first three is
        a ``(train, test)`` pair of imputed feature sets and ``labels`` is
        ``(y_train, y_test)``.
    """
    label_ = df[label]
    df = df.drop(columns=[label])
    X_train, X_test, y_train, y_test = train_test_split(df, label_, test_size=0.2, random_state=209)
    # KNN imputation
    knn_imputer = KNNImputer(n_neighbors=10).fit(X_train)
    X_train_knnimp = knn_imputer.transform(X_train)
    X_test_knnimp = knn_imputer.transform(X_test)
    # MissForest (works on copies because of the dtype cast below)
    X_train_missForest, X_test_missForest = X_train.copy(), X_test.copy()
    if col_cate is not None:
        X_train_missForest[col_cate] = X_train_missForest[col_cate].astype('category')
        X_test_missForest[col_cate] = X_test_missForest[col_cate].astype('category')
    rdf_imputer = MissForest(max_depth=5, random_state=209, max_iter=8, verbose=0).fit(X_train_missForest)
    X_train_missForest = rdf_imputer.transform(X_train_missForest)
    X_test_missForest = rdf_imputer.transform(X_test_missForest)
    # MICE; verbose=0 is the documented silent level (the original -1 is
    # outside the documented range).
    mice_imputer = IterativeImputer(random_state=209, max_iter=8, verbose=0).fit(X_train)
    X_train_mice = mice_imputer.transform(X_train)
    X_test_mice = mice_imputer.transform(X_test)
    return (X_train_knnimp, X_test_knnimp), (X_train_missForest, X_test_missForest), (X_train_mice, X_test_mice), (y_train, y_test)

In [None]:
# Evaluate KNN, MissForest and MICE imputation at each missing rate.
list_20_res, list_40_res, list_60_res = [], [], []
# NOTE(review): the [:1] slice restricts this run to the first dataset only;
# confirm whether that is intended before relying on these lists.
for idx, _ in enumerate(list_20per_miss[:1]):
    # Same procedure for the 20 %, 40 % and 60 % missing-rate collections.
    for miss_list, res_list in (
        (list_20per_miss, list_20_res),
        (list_40per_miss, list_40_res),
        (list_60per_miss, list_60_res),
    ):
        cat_cols = miss_list[idx]["cat_cols"]
        for key, value in miss_list[idx].items():
            if key == "cat_cols":
                continue
            knn_sets, forest_sets, mice_sets, label_set = fill_missing_split_norm_method(value, cat_cols)
            # Number of classes is taken from the training labels.
            num_class = len(set(label_set[0]))
            acc_knn, f1_knn = result_each_method(model_for_classification(num_class), knn_sets, label_set)
            acc_rdf, f1_rdf = result_each_method(model_for_classification(num_class), forest_sets, label_set)
            acc_mice, f1_mice = result_each_method(model_for_classification(num_class), mice_sets, label_set)
            res_list.append({
                "Data": idx,
                "type": key,
                "KNN": (acc_knn, f1_knn),
                "RDF": (acc_rdf, f1_rdf),
                "MICE": (acc_mice, f1_mice),
            })

In [None]:
# Reload previously saved KNN/MissForest/MICE results; this replaces whatever
# the lists currently hold. Context managers close each file once it is parsed
# (the bare open() calls left the handles open).
with open("Result/result_per20.json") as infile:
    list_20_res = json.load(infile)
with open("Result/result_per40.json") as infile:
    list_40_res = json.load(infile)
with open("Result/result_per60.json") as infile:
    list_60_res = json.load(infile)

# Thử nghiệm với AutoEncoder

In [None]:
def compile_model(model, lr=0.01):
    """Compile ``model`` with Adam, an MSE loss and an MSE metric.

    Parameters
    ----------
    model
        Keras model to compile (compiled in place).
    lr : float, default 0.01
        Learning rate for the Adam optimizer.

    Returns
    -------
    The same ``model`` object, now compiled.
    """
    model.compile(
        optimizer=Optimizer.Adam(learning_rate=lr),
        loss=tf.keras.losses.MeanSquaredError(),
        metrics=[tf.keras.metrics.mean_squared_error],
    )
    return model

In [None]:
def build_model_ae(input_shape, type_model = "dae"):
    """Create a Bayesian-optimisation tuner for an autoencoder.

    Parameters
    ----------
    input_shape : int
        Number of input features; also used to derive the candidate layer
        widths of the VAE search space.
    type_model : str, default "dae"
        ``"dae"`` tunes ``DAE.create_model`` (objective: ``loss``);
        ``"vae"`` and ``"cae"`` both tune ``VAE.create_model`` (objective:
        ``total_loss``).

    Returns
    -------
    ``kt.BayesianOptimization`` for a supported type, otherwise ``None`` after
    printing a message.

    Notes
    -----
    Each tuner gets its own ``project_name`` and ``overwrite=True``. With the
    keras-tuner defaults every tuner shares one project directory and reloads
    its state, so tuners built for different models or datasets would reuse
    each other's trials and checkpoints.

    NOTE(review): the original had a second, unreachable ``elif`` for "cae"
    that returned ``None``; because the earlier condition already matched
    "cae", the reachable behaviour (VAE tuner) is what is kept here. Confirm
    whether a dedicated CAE tuner was intended.
    """
    if type_model == "dae":
        def buil_model_tuner_dae(hp):
            # Search space of the denoising autoencoder.
            return compile_model(DAE.create_model(
                input_size=input_shape,
                num_layer=hp.Choice('num_layer', [2, 3, 4], default=3, ordered=True),
                drop_out_rate=hp.Choice('dr_rate', [0.4, 0.5], default=0.5, ordered=True),
                theta=hp.Choice('theta', [5,7,9], default=7, ordered=True),
                activation_func=hp.Choice('acf', ['tanh', 'relu'], default='tanh', ordered=False)))

        return kt.BayesianOptimization(
            buil_model_tuner_dae,
            objective=kt.Objective("loss", direction="min"),
            max_trials=20,
            project_name="tuner_dae",
            overwrite=True,
        )

    if type_model == "vae" or type_model == "cae":
        def buil_model_tuner_vae(hp):
            # Candidate layer widths are offsets from the input width.
            units_layer_1 = hp.Choice('units_layer_1', [input_shape - 3, input_shape + 5, input_shape + 10], ordered=True)
            units_layer_2 = hp.Choice('units_layer_2', [input_shape - 5, input_shape + 8, input_shape + 15], ordered=True)
            latent_dim = hp.Choice('latent_dim', [5, 6 ,7], ordered=True)
            return VAE.compile_model(VAE.create_model(
                input_size=input_shape,
                layer_units=[units_layer_1, units_layer_2],
                latent_dim=latent_dim,
                activation_func=hp.Choice('acf', ['tanh', 'relu'], default='relu', ordered=False)))

        return kt.BayesianOptimization(
            buil_model_tuner_vae,
            objective=kt.Objective("total_loss", direction="min"),
            max_trials=20,
            project_name="tuner_{}".format(type_model),
            overwrite=True,
        )

    # Unsupported model type.
    print("Không có mô hình AutoEncoder phù hợp")
    return None

In [None]:
def fill_missing_split_ae_method(df, label='label'):
    """Split the data, mean-fill it and reconstruct it with tuned DAE and VAE models.

    Parameters
    ----------
    df : pandas.DataFrame
        Incomplete dataset that still contains the ``label`` column.
    label : str, default 'label'
        Name of the target column.

    Returns
    -------
    tuple
        ``(dae, vae, labels)`` where ``dae`` and ``vae`` are ``(train, test)``
        pairs holding the ``predict`` output of the best model of each tuner
        and ``labels`` is ``(y_train, y_test)``. Callers index the VAE outputs
        with ``[0]``, so that model presumably returns several outputs —
        confirm against ``VAE.create_model``.
    """
    label_ = df[label]
    df = df.drop(columns=[label])
    X_train, X_test, y_train, y_test = train_test_split(df, label_, test_size=0.2, random_state=209)

    # Initial fill: replace gaps with the per-column mean of the training
    # split. The means are computed once and reused for the test split, and
    # fillna returns new frames instead of writing into the split outputs.
    train_means = X_train.mean()
    X_train = X_train.fillna(train_means)
    X_test = X_test.fillna(train_means)

    tuner_dae = build_model_ae(X_train.shape[1], type_model='dae')
    tuner_vae = build_model_ae(X_train.shape[1], type_model='vae')
    # The DAE is trained to reproduce its own input.
    tuner_dae.search(X_train, X_train, epochs=120, batch_size=32,
                     callbacks=[EarlyStopping(monitor='loss', patience=20, mode='min'),
                                ReduceLROnPlateau(monitor='loss', factor=0.8, patience=5, mode='min')], verbose=0)
    # The VAE search receives the inputs only and is tracked through "total_loss".
    tuner_vae.search(X_train.values, epochs=120, batch_size=32,
                     callbacks=[EarlyStopping(monitor='total_loss', patience=20, mode='min'),
                                ReduceLROnPlateau(monitor='total_loss', factor=0.8, patience=5, mode='min')], verbose=0)
    best_model_dae = tuner_dae.get_best_models(1)[0]
    best_model_vae = tuner_vae.get_best_models(1)[0]
    train_fill_dae, val_fill_dae = best_model_dae.predict(X_train), best_model_dae.predict(X_test)
    train_fill_vae, val_fill_vae = best_model_vae.predict(X_train), best_model_vae.predict(X_test)
    return (train_fill_dae, val_fill_dae), (train_fill_vae, val_fill_vae), (y_train, y_test)

In [None]:
# Accumulators for the autoencoder experiments: one result list per missing
# rate, plus the reconstructed data kept for later inspection.
list_20_res_ae = []
list_40_res_ae = []
list_60_res_ae = []
save_data = []

In [None]:
# Evaluate DAE and VAE reconstruction for every dataset at each missing rate.
for idx, _ in enumerate(list_20per_miss):
    # Same procedure for the 20 %, 40 % and 60 % missing-rate collections.
    for miss_list, res_list in (
        (list_20per_miss, list_20_res_ae),
        (list_40per_miss, list_40_res_ae),
        (list_60per_miss, list_60_res_ae),
    ):
        for key, value in miss_list[idx].items():
            if key == "cat_cols":
                continue
            dae_fill, vae_fill, label_set = fill_missing_split_ae_method(value)
            # Number of classes is taken from the training labels.
            num_class = len(set(label_set[0]))
            acc_dae, f1_dae = result_each_method(model_for_classification(num_class), dae_fill, label_set)
            # Only element [0] of each VAE prediction is used as features.
            acc_vae, f1_vae = result_each_method(
                model_for_classification(num_class), (vae_fill[0][0], vae_fill[1][0]), label_set
            )
            save_data.append((dae_fill, vae_fill))
            res_list.append({"Data": idx, "type": key, "DAE": (acc_dae, f1_dae), "VAE": (acc_vae, f1_vae)})

In [None]:
# Persist the autoencoder results for every missing rate. The file names match
# the ones read back in the summary section; the original wrote only the 60 %
# list, and to "result_per60_ae.json.json" (doubled extension), which the
# later loading cell does not read.
for file_name, results in (
    ("Result/result_per20_ae.json", list_20_res_ae),
    ("Result/result_per40_ae.json", list_40_res_ae),
    ("Result/result_per60_ae.json", list_60_res_ae),
):
    with open(file_name, "w") as outfile:
        json.dump(results, outfile)

In [None]:
# NOTE(review): `df` is not assigned at notebook scope in any cell visible
# here, so this line raises NameError on a fresh kernel, and `train_data` is
# not used by the visible cells that follow. Looks like a leftover scratch
# cell — confirm before removing.
train_data = StandardScaler().fit_transform(df.values)

# DuoLoss AE

In [None]:
def build_model_ae(num_class, input_shape):
    """Create a Bayesian-optimisation tuner for the DuoLoss autoencoder.

    NOTE: this definition reuses the name of the earlier
    ``build_model_ae(input_shape, type_model)`` and replaces it once this cell
    has run; the DAE/VAE cells above must be re-executed from their own
    definition cell before they can be run again.

    Parameters
    ----------
    num_class : int
        Number of classes passed to ``DuoLossAE.create_model``.
    input_shape : int
        Number of input features.

    Returns
    -------
    ``kt.BayesianOptimization`` minimising ``total_loss`` over 30 trials.

    Notes
    -----
    The tuner uses its own ``project_name`` and ``overwrite=True``. With the
    keras-tuner defaults every call would reload the state left in the shared
    default project directory by earlier tuners or datasets.
    """
    def buil_model_tuner_duoloss(hp):
        # Search space of the DuoLoss autoencoder.
        return DuoLossAE.compile_model(DuoLossAE.create_model(
            num_class=num_class,
            input_size=input_shape,
            num_layer=hp.Choice('num_layer', [2, 3, 4], default=3, ordered=True),
            drop_out_rate=hp.Choice('dr_rate', [0.4, 0.5], default=0.5, ordered=True),
            theta=hp.Choice('theta', [5,7,9], default=7, ordered=True),
            activation_func=hp.Choice('acf', ['tanh', 'relu'], default='tanh', ordered=False),
            alpha=hp.Choice('alp', [0.4, 0.5, 0.6, 0.7], default=0.5, ordered=True)))

    return kt.BayesianOptimization(
        buil_model_tuner_duoloss,
        objective=kt.Objective("total_loss", direction="min"),
        max_trials=30,
        project_name="tuner_duoloss",
        overwrite=True,
    )

In [None]:
def duo_method(df, label='label'):
    """Tune a DuoLoss autoencoder and score its classification output.

    Parameters
    ----------
    df : pandas.DataFrame
        Incomplete dataset that still contains the ``label`` column.
    label : str, default 'label'
        Name of the target column.

    Returns
    -------
    tuple
        ``(accuracy, macro_f1)`` on the test split, computed from the arg-max
        of element ``[1]`` of the best model's ``predict`` output.
    """
    label_ = df[label]
    df = df.drop(columns=[label])
    X_train, X_test, y_train, y_test = train_test_split(df, label_, test_size=0.2, random_state=209)

    # Initial fill: replace gaps with the per-column mean of the training
    # split. The means are computed once and reused for the test split, and
    # fillna returns new frames instead of writing into the split outputs.
    train_means = X_train.mean()
    X_train = X_train.fillna(train_means)
    X_test = X_test.fillna(train_means)

    # Uses the (num_class, input_shape) definition of build_model_ae.
    tuner_duoLoss = build_model_ae(len(set(label_)), X_train.shape[1])
    tuner_duoLoss.search(X_train.values, y_train, epochs=120, batch_size=32,
                         callbacks=[EarlyStopping(monitor='total_loss', patience=20, mode='min'),
                                    ReduceLROnPlateau(monitor='total_loss', factor=0.8, patience=5, mode='min')], verbose=0)
    best_model = tuner_duoLoss.get_best_models(1)[0]
    pred = np.argmax(best_model.predict(X_test)[1], axis=1)
    return accuracy_score(y_test, pred), f1_score(y_test, pred, average="macro")

In [None]:
# Accumulators for the DuoLoss experiments, one list per missing rate.
list_20_res_duo = []
list_40_res_duo = []
list_60_res_duo = []

In [None]:
# Evaluate the DuoLoss autoencoder for every dataset at each missing rate.
for idx, _ in enumerate(list_20per_miss):
    # Same procedure for the 20 %, 40 % and 60 % missing-rate collections.
    for miss_list, res_list in (
        (list_20per_miss, list_20_res_duo),
        (list_40per_miss, list_40_res_duo),
        (list_60per_miss, list_60_res_duo),
    ):
        for key, value in miss_list[idx].items():
            if key == "cat_cols":
                continue
            acc_duo, f1_duo = duo_method(value)
            res_list.append({"Data": idx, "type": key, "DAE_DUO": (acc_duo, f1_duo)})

In [None]:
# Persist the DuoLoss results for every missing rate. The summary section
# reads all three files, while the original cell wrote only the 60 % one.
for file_name, results in (
    ("Result/result_per20_duo.json", list_20_res_duo),
    ("Result/result_per40_duo.json", list_40_res_duo),
    ("Result/result_per60_duo.json", list_60_res_duo),
):
    with open(file_name, "w") as outfile:
        json.dump(results, outfile)

In [None]:
# Display the DuoLoss results collected for the 40 % missing rate.
list_40_res_duo

# Tổng hợp kết quả

In [None]:
def _read_json(path):
    """Parse the JSON file at ``path``, closing the handle afterwards."""
    with open(path) as infile:
        return json.load(infile)


# Saved results per missing rate: classic imputers, autoencoders and DuoLoss.
# The original nine bare open() calls were never closed.
res_20 = _read_json("Result/result_per20.json")
res_40 = _read_json("Result/result_per40.json")
res_60 = _read_json("Result/result_per60.json")
res_20_ae = _read_json("Result/result_per20_ae.json")
res_40_ae = _read_json("Result/result_per40_ae.json")
res_60_ae = _read_json("Result/result_per60_ae.json")
res_20_duo = _read_json("Result/result_per20_duo.json")
res_40_duo = _read_json("Result/result_per40_duo.json")
res_60_duo = _read_json("Result/result_per60_duo.json")

In [None]:
# Copy the autoencoder scores into the matching classic-imputer result rows so
# that each row carries every method. Rows are paired by position.
# The original iterated the keys of res_40_ae while filling res_60
# (copy-paste slip); each list is now merged with its own counterpart.
for base_results, ae_results in (
    (res_20, res_20_ae),
    (res_40, res_40_ae),
    (res_60, res_60_ae),
):
    for idx in range(len(base_results)):
        ae_row = ae_results[idx]
        for method in list(ae_row.keys()):
            if method not in ["Data", "type"]:
                base_results[idx][method] = (ae_row[method][0], ae_row[method][1])

In [None]:
def create_df(res_1, typ="mcar"):
    """Tabulate accuracy/F1 per dataset and method for one missingness mechanism.

    Rows of ``res_1`` are selected by position modulo 4: for ``typ == "mcar"``
    positions 0 (uniform) and 1 (random) are used, for any other value
    positions 2 (uniform) and 3 (random).

    Parameters
    ----------
    res_1 : list of dict
        Result rows; every key other than "Data" and "type" is treated as a
        method whose value is indexable as ``[accuracy, f1]``.
    typ : str, default "mcar"
        Mechanism selector as described above.

    Returns
    -------
    pandas.DataFrame
        One row per (dataset, method) with scores rounded to 3 decimals.
    """
    dict_data = {0: "BC", 1: "CE", 2: "CMC", 3: "IM", 4: "TTT"}
    uniform_pos, random_pos = (0, 1) if typ == "mcar" else (2, 3)
    table = {
        "Dữ liệu": [],
        "Phương pháp": [],
        "Uniform_acc": [],
        "Uniform_f1": [],
        "Random_acc": [],
        "Random_f1": [],
    }
    for index, each in enumerate(res_1):
        position = index % 4
        if position != uniform_pos and position != random_pos:
            continue
        for method in [name for name in each.keys() if name not in ["Data", "type"]]:
            if position == uniform_pos:
                # Dataset and method labels are recorded from the uniform rows.
                table["Dữ liệu"].append(dict_data[each["Data"]])
                table["Phương pháp"].append(method)
                table["Uniform_acc"].append(round(each[method][0], 3))
                table["Uniform_f1"].append(round(each[method][1], 3))
            else:
                table["Random_acc"].append(round(each[method][0], 3))
                table["Random_f1"].append(round(each[method][1], 3))
    return pd.DataFrame(data=table)

In [None]:
# Export one CSV per missing rate and mechanism.
# Any `typ` other than "mcar" makes create_df pick the third and fourth row of
# each group of four, so the "mar" files hold the rows generated with the
# "mnar" mechanism.
for results, typ, csv_path in (
    (res_20, "mcar", "ResultCSV/20_per_mcar.csv"),
    (res_20, "mar", "ResultCSV/20_per_mar.csv"),
    (res_40, "mcar", "ResultCSV/40_per_mcar.csv"),
    (res_40, "mar", "ResultCSV/40_per_mar.csv"),
    (res_60, "mcar", "ResultCSV/60_per_mcar.csv"),
    (res_60, "mar", "ResultCSV/60_per_mar.csv"),
):
    create_df(results, typ=typ).to_csv(csv_path, index=False)

In [None]:
def get_count_best(res, typ=0):
    """Count how often each method achieves the best score in a result row.

    For every row the highest score (never lower than 0) is determined, and
    every method that equals it gets one point, so ties count for all tied
    methods. Rows whose "type" contains "mnar" feed the first dictionary, all
    other rows the second. The method key "RDF" is counted as "MissForest".

    Parameters
    ----------
    res : list of dict
        Result rows; every key other than "Data" and "type" is a method whose
        value is indexable by ``typ``.
    typ : int, default 0
        Index of the score to compare (rows store accuracy at 0 and F1 at 1).

    Returns
    -------
    tuple of dict
        ``(dict_max_count_mar, dict_max_count_mcar)``.
    """
    method_names = ("KNN", "MissForest", "MICE", "DAE", "VAE")
    dict_max_count_mar = dict.fromkeys(method_names, 0)
    dict_max_count_mcar = dict.fromkeys(method_names, 0)
    alias = {"RDF": "MissForest"}

    for each in res:
        scores = {name: each[name][typ] for name in each.keys() if name not in ["Data", "type"]}

        # Best score of the row, starting from a floor of 0.
        max_value = 0
        for score in scores.values():
            if score > max_value:
                max_value = score

        for name, score in scores.items():
            if score != max_value:
                continue
            counts = dict_max_count_mar if "mnar" in each["type"] else dict_max_count_mcar
            counts[alias.get(name, name)] += 1

    return dict_max_count_mar, dict_max_count_mcar

In [None]:
# Number of "best method" wins per missing rate.
# typ=0 compares the first stored score (accuracy), typ=1 the second (F1).
dict_max_count_20_mar, dict_max_count_20_mcar = get_count_best(res_20, typ=0)
dict_max_count_20_mar_f1, dict_max_count_20_mcar_f1 = get_count_best(res_20, typ=1)

dict_max_count_40_mar, dict_max_count_40_mcar = get_count_best(res_40, typ=0)
dict_max_count_40_mar_f1, dict_max_count_40_mcar_f1 = get_count_best(res_40, typ=1)

dict_max_count_60_mar, dict_max_count_60_mcar = get_count_best(res_60, typ=0)
dict_max_count_60_mar_f1, dict_max_count_60_mcar_f1 = get_count_best(res_60, typ=1)

In [None]:
# Hand-entered "best method" counts used by the plots below.
# NOTE(review): these literals replace the dictionaries of the same names
# computed with get_count_best, and they use different method labels
# ("AEE-DAE", "AEP-DAE" instead of "DAE", "VAE"). The notebook does not show
# where the numbers come from — confirm their provenance.
dict_max_count_20_mcar = {"AEE-DAE": 6, "KNN": 2, "MissForest": 1, "MICE": 1, "AEP-DAE": 1}
dict_max_count_20_mar = {"AEE-DAE": 4, "KNN": 1, "MissForest": 2, "MICE": 3, "AEP-DAE": 2}
dict_max_count_20_mcar_f1 = {"AEE-DAE": 6, "KNN": 1, "MissForest": 1, "MICE": 1, "AEP-DAE": 1}
dict_max_count_20_mar_f1 = {"AEE-DAE": 2, "KNN": 1, "MissForest": 3, "MICE": 3, "AEP-DAE": 2}

dict_max_count_40_mcar = {"AEE-DAE": 6, "KNN": 2, "MissForest": 2, "MICE": 1, "AEP-DAE": 2}
dict_max_count_40_mar = {"AEE-DAE": 4, "KNN": 0, "MissForest": 2, "MICE": 3, "AEP-DAE": 3}
dict_max_count_40_mcar_f1 = {"AEE-DAE": 5, "KNN": 3, "MissForest": 2, "MICE": 1, "AEP-DAE": 0}
dict_max_count_40_mar_f1 = {"AEE-DAE": 5, "KNN": 0, "MissForest": 1, "MICE": 4, "AEP-DAE": 1}

dict_max_count_60_mcar = {"AEE-DAE": 4, "KNN": 2, "MissForest": 1, "MICE": 2, "AEP-DAE": 2}
dict_max_count_60_mar = {"AEE-DAE": 4, "KNN": 3, "MissForest": 3, "MICE": 1, "AEP-DAE": 0}
dict_max_count_60_mcar_f1 = {"AEE-DAE": 5, "KNN": 2, "MissForest": 1, "MICE": 1, "AEP-DAE": 2}
dict_max_count_60_mar_f1 = {"AEE-DAE": 4, "KNN": 2, "MissForest": 3, "MICE": 1, "AEP-DAE": 1}

In [None]:
# Use a larger default font for all figures that follow.
plt.rcParams['font.size'] = 16

In [None]:
# Grouped bar charts of "best method" counts: one row per missing rate
# (20 %, 40 %, 60 %), MCAR on the left and MAR on the right.
fig, ax = plt.subplots(3, 2, figsize=(15, 20))
panels = (
    (20, dict_max_count_20_mcar, dict_max_count_20_mcar_f1, dict_max_count_20_mar, dict_max_count_20_mar_f1),
    (40, dict_max_count_40_mcar, dict_max_count_40_mcar_f1, dict_max_count_40_mar, dict_max_count_40_mar_f1),
    (60, dict_max_count_60_mcar, dict_max_count_60_mcar_f1, dict_max_count_60_mar, dict_max_count_60_mar_f1),
)
for row, (rate, mcar_acc, mcar_f1, mar_acc, mar_f1) in enumerate(panels):
    for col, (title, acc_counts, f1_counts) in enumerate((("MCAR", mcar_acc, mcar_f1), ("MAR", mar_acc, mar_f1))):
        axis = ax[row][col]
        labels = list(acc_counts.keys())
        X_axis = np.arange(len(labels))
        # F1 bars are looked up by method name so they always line up with the
        # accuracy bars, whatever the key order of the two dictionaries.
        axis.bar(X_axis - 0.2, np.array([acc_counts[name] for name in labels], dtype="int"), 0.4, label="Accuracy", color="blue")
        axis.bar(X_axis + 0.2, np.array([f1_counts[name] for name in labels], dtype="int"), 0.4, label="F1_score", color="red")
        # Separate calls for positions and labels work on every matplotlib version.
        axis.set_xticks(X_axis)
        axis.set_xticklabels(labels)
        axis.legend()
        # Include the missing rate: the three rows previously shared the same titles.
        axis.set_title("{} ({}% missing)".format(title, rate))