In [1]:
import os
from sklearn.preprocessing import LabelEncoder
import category_encoders as ce
import pandas as pd
import math
import glob
import numpy as np
import statistics
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Input: a single-row DataFrame holding one incomplete instance (one or more NaN cells)
#        whose distance from the class center is greater than the threshold
# Output: the imputed instance

def process(inst,mean_std,class_center):
    """Re-impute one incomplete instance and return the best candidate row.

    One candidate row is built per missing column of ``inst``: in candidate
    ``k`` the ``k``-th missing column takes its value from ``mean_std`` and all
    remaining missing columns take their value from ``class_center``. The
    candidate with the smallest Euclidean distance to ``class_center`` is
    returned (the first one on ties).

    Parameters
    ----------
    inst : pandas.DataFrame
        The instance to impute; expected to contain exactly one row.
    mean_std : pandas.DataFrame
        Frame whose first row holds the replacement value for each column.
    class_center : pandas.Series
        Per-column centre values, indexed by the column labels of ``inst``.

    Returns
    -------
    pandas.Series
        The selected candidate row. If ``inst`` has no missing value, a copy
        of its first row is returned unchanged.
    """
    # columns in which a value is missing
    missing_cols = inst.columns[inst.isna().any()]
    n_missing = len(missing_cols)

    # nothing to impute: return the row as-is instead of failing in pd.concat([])
    if n_missing == 0:
        return inst.iloc[0].copy()

    # one candidate row per missing column (concat already returns a new frame)
    candidates = pd.concat([inst] * n_missing)

    # candidate k gets the mean_std value in the k-th missing column; the column
    # is addressed by position via get_loc because the concatenated frame has a
    # repeated index and the column labels need not be 0..n-1
    for k, col in enumerate(missing_cols):
        candidates.iloc[k, candidates.columns.get_loc(col)] = mean_std[col].iloc[0]

    # every remaining NaN is filled with the class-center value of its column
    candidates = candidates.fillna(class_center)

    # pick the candidate closest to the class center
    dummy_distances = [math.dist(class_center, candidates.iloc[k]) for k in range(n_missing)]
    min_index = dummy_distances.index(min(dummy_distances))

    return candidates.iloc[min_index]

In [3]:
# Input: data and class name
# Output: complete and incomplete dataset of that particular class

def data_split(data,c):
    """Split the rows of class ``c`` into complete and incomplete rows.

    The class label is read from the last column of ``data``, selected by
    position so the function does not depend on how the columns are labelled.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose last column holds the class label.
    c : scalar
        Class label to select.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``c_data`` - rows of class ``c`` without any missing value;
        ``ic_data`` - rows of class ``c`` with at least one missing value.
    """
    # filter the class once and reuse it for both outputs
    class_rows = data[data.iloc[:, -1] == c]
    has_missing = class_rows.isnull().any(axis=1)

    c_data = class_rows.dropna()
    ic_data = class_rows[has_missing]

    return c_data,ic_data

In [4]:
# Locations of the two Excel workbooks; replace the placeholders before running.
ORIGINAL_DATA_PATH = r'give here original data location'
MISSING_DATA_PATH = 'give here location of data that has missing values'

# header=None: the sheets are read without a header row.
fetched_original_data = pd.read_excel(ORIGINAL_DATA_PATH, header=None)
fetched_data = pd.read_excel(MISSING_DATA_PATH, header=None)

In [5]:
# Ordinal encoder from category_encoders, configured with handle_missing='return_nan';
# presumably this keeps missing cells as NaN after encoding — TODO confirm against the library docs.
enc = ce.ordinal.OrdinalEncoder(handle_missing = 'return_nan')

In [6]:
# Fit the encoder on the original table, then apply the already-fitted encoder
# to the table with missing values so both tables share one encoding.
# NOTE(review): both names are overwritten with their encoded versions, so re-running
# this cell refits the encoder on already-encoded data; restart from the load cell instead.
fetched_original_data = enc.fit_transform(fetched_original_data)
fetched_data = enc.transform(fetched_data)

In [7]:
# Working copies: original_data is the reference used for the accuracy computation,
# data is the table with missing values that gets imputed.
original_data = fetched_original_data.copy()
data = fetched_data.copy()

In [8]:
# Accuracy (AE) results are accumulated in this list by the imputation cell.
AE_values = []
# Distinct class labels, read from the last column by position so the lookup
# does not depend on the columns being labelled 0..n-1.
class_list = np.unique(data.iloc[:, -1])

In [9]:
def _impute_rows(rows, class_center, threshold, mean_std):
    """Impute every row of ``rows`` and return the imputed copy.

    Each row first has its NaNs replaced by the matching ``class_center`` value.
    If the filled row is farther from ``class_center`` than ``threshold``, it is
    replaced by the result of ``process()`` applied to the original, unfilled row.
    """
    imputed = rows.fillna(class_center)
    for label in imputed.index:
        if math.dist(class_center, imputed.loc[label]) > threshold:
            imputed.loc[label] = process(rows.loc[[label]], mean_std, class_center)
    return imputed


# per-class result frames; concatenated once after the loop
# (DataFrame.append was removed in pandas 2.0, and growing a frame per iteration is wasteful)
imputed_parts = []

for i in class_list:

    #-----------------------------first algorithm-----------------------------------------------------------------#

    # splitting the data into complete and incomplete data based on class
    complete_data, incomplete_data = data_split(data, i)
    if complete_data.empty:
        # without complete rows there is no class center and no threshold to compare against
        raise ValueError(f"class {i!r} has no complete rows; cannot compute its class center")

    # complete rows go to the final imputed data unchanged
    imputed_parts.append(complete_data)

    # class center and standard deviation
    class_center = complete_data.mean()
    std_deviation = complete_data.std()

    # threshold = median distance of the complete rows from the class center
    distances = [math.dist(class_center, row) for _, row in complete_data.iterrows()]
    threshold = statistics.median(distances)

    #-----------------------------second algorithm----------------------------------------------------------------#

    # mean + std per column, as a one-row frame (used by step 2 of the second algorithm)
    mean_std = (class_center + std_deviation).to_frame().T

    # splitting the incomplete data into rows with exactly one NaN and rows with several NaNs
    nan_counts = incomplete_data.isna().sum(axis=1)
    single_nan = incomplete_data[nan_counts == 1]
    multiple_nan = incomplete_data[nan_counts > 1]

    # class-center imputation, with process() as fallback for rows beyond the threshold
    single_nan_imp = _impute_rows(single_nan, class_center, threshold, mean_std)
    multiple_nan_imp = _impute_rows(multiple_nan, class_center, threshold, mean_std)
    imputed_parts.extend([single_nan_imp, multiple_nan_imp])

    # the per-class frames were previously published as module-level names
    # (complete_data_<class>, ...); keep doing so for anything that inspects them
    globals().update({
        "complete_data_"+str(i): complete_data,
        "incomplete_data_"+str(i): incomplete_data,
        'single_nan_'+str(i): single_nan,
        'multiple_nan_'+str(i): multiple_nan,
        'single_nan_imp_'+str(i): single_nan_imp,
        'multiple_nan_imp_'+str(i): multiple_nan_imp,
    })

#---------------assembling the final imputed data (once, after all classes are processed)----------------------------#
non_empty_parts = [part for part in imputed_parts if not part.empty]
if non_empty_parts:
    final_imputed_data = pd.concat(non_empty_parts)
else:
    # no class produced any row: keep an empty frame with the columns of data
    final_imputed_data = data.iloc[0:0].copy()
final_imputed_data = final_imputed_data.sort_index().round().astype('int64')

# converting data from numerical to categorical
# NOTE(review): this assumes the codes 0/1/2 stand for 'b'/'o'/'x'; verify against the
# mapping actually produced by the encoder (its codes may not start at 0) — TODO confirm
final_categorical_data = final_imputed_data.replace([0,1,2],['b','o','x'])
final_categorical_data.iloc[:,-1] = final_categorical_data.iloc[:,-1].replace(['o','x'],[1,2])

# AE = share of cells in which the imputed table equals the original table
numberOfCell = len(original_data.index) * len(original_data.columns)
a = original_data - final_imputed_data
numerator = (a == 0).sum().sum()
AE = numerator/numberOfCell

AE_values.append(AE)
    
    
    
    
    


In [10]:
# Show the accuracy value(s) collected by the imputation cell above.
AE_values

[0.9987322515212982]