In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('whitegrid')

In [3]:
from time import time
from tqdm import tqdm
import lightgbm as lgb
from pandas.api.types import is_categorical_dtype, is_numeric_dtype
from sklearn.metrics import mean_squared_error

class BaseMICE:
    """Base class for the MICE implementation.

    Provides the shared driver loops (``fill_missing_values`` and
    ``benchmark``), the initial mean/mode imputation and the loss
    computation. Subclasses implement ``transform``, which performs one
    imputation pass over the data.

    Parameters
    ----------
    max_iter : int, default 10
        Number of ``transform`` passes to run.
    previous_loss : float, default 0
        Best loss known before ``benchmark`` starts. ``0`` means "no
        reference loss yet", so the first iteration is always checkpointed.
    """

    def __init__(self, max_iter=10, previous_loss = 0):
        self.max_iter = max_iter
        self.previous_loss = previous_loss

    @staticmethod
    def _columns_with_missing(df):
        """Return the per-column NaN counts, restricted to columns that have any."""
        nan_counts = df.isna().sum()
        return nan_counts[nan_counts > 0]

    def _loss_without(self, df_original, df_imputed, drop_columns_loss):
        """Compute the loss after optionally dropping columns from both frames."""
        if drop_columns_loss:
            df_original = df_original.drop(columns=drop_columns_loss)
            df_imputed = df_imputed.drop(columns=drop_columns_loss)
        return self.compute_loss(df_original, df_imputed)

    def fill_missing_values(self, df):
        """Fills the missing values of a pandas DataFrame.

        Parameters
        ----------
        df : pandas.DataFrame
            Input data with missing values (nans).

        Returns
        -------
        pandas.DataFrame
            DataFrame with imputed missing values.
        """
        columns_missing = self._columns_with_missing(df)
        # Positional [row, column] pairs of every NaN cell in the input.
        nan_ids = np.argwhere(df.isna().values).tolist()
        df_imputed = self.impute_initial_mean_or_mode(df)
        for iter_id in range(self.max_iter):
            # Same call shape as benchmark(), so subclasses need one signature.
            df_imputed = self.transform(df_imputed, columns_missing, nan_ids, iter_id)
        return df_imputed

    def benchmark(self, df_original, df_missing, drop_columns_loss=None,
                  output_path="imputed_df.csv"):
        """Benchmarks the fill method for missing values.

        Runs ``max_iter`` imputation passes, measuring the wall-clock time
        and the loss against ``df_original`` after each pass. Whenever a pass
        improves on the best loss seen so far, the imputed frame is written
        to ``output_path``, so the file ends up holding the best iteration.

        Parameters
        ----------
        df_original : pandas.DataFrame
            Original data.
        df_missing : pandas.DataFrame
            Input data with missing values (nans).
        drop_columns_loss : list, optional
            Drop columns in the result DataFrame when
            computing the loss.
        output_path : str or None, default "imputed_df.csv"
            Where to checkpoint the best imputed frame. ``None`` disables
            writing.

        Returns
        -------
        list of dict
            One record per iteration with the keys ``"iter"``,
            ``"time_seconds"`` and ``"loss"``.

        Notes
        -----
        Side effects: sets ``self.df_mean`` to a copy of the initial
        mean/mode imputation and updates ``self.previous_loss`` to the best
        loss observed.
        """
        columns_missing = self._columns_with_missing(df_missing)
        nan_ids = np.argwhere(df_missing.isna().values).tolist()
        df_imputed = self.impute_initial_mean_or_mode(df_missing)
        self.df_mean = df_imputed.copy()

        # A zero previous_loss is the "unset" marker; track it separately so a
        # genuinely perfect loss of 0.0 is not mistaken for "unset" later on.
        has_reference_loss = self.previous_loss != 0

        iter_results = []
        for iter_id in range(self.max_iter):
            time_start = time()
            df_imputed = self.transform(df_imputed, columns_missing, nan_ids, iter_id)
            elapsed_seconds = time() - time_start

            loss = self._loss_without(df_original, df_imputed, drop_columns_loss)

            # Lower loss is better: checkpoint only on improvement.
            if not has_reference_loss or loss < self.previous_loss:
                self.previous_loss = loss
                has_reference_loss = True
                if output_path is not None:
                    df_imputed.to_csv(output_path, index = False)

            iter_results.append({
                "iter": iter_id,
                "time_seconds": elapsed_seconds,
                "loss": loss
            })
        return iter_results

    def benchmark_mean_loss(self, df_original, df_missing, drop_columns_loss=None):
        """Builds the baseline records for plain mean/mode imputation.

        The imputation is computed once; its time and loss are repeated
        ``max_iter`` times so the result has the same shape as the list
        returned by ``benchmark``.

        Returns
        -------
        list of dict
            ``max_iter`` records with the keys ``"iter"``, ``"time_seconds"``
            and ``"loss"``.
        """
        time_start = time()
        df_imputed = self.impute_initial_mean_or_mode(df_missing)
        elapsed_seconds = time() - time_start

        # The imputed frame never changes, so the loss is computed only once.
        loss = self._loss_without(df_original, df_imputed, drop_columns_loss)

        return [
            {"iter": iter_id, "time_seconds": elapsed_seconds, "loss": loss}
            for iter_id in range(self.max_iter)
        ]

    def get_model(self, target):
        """Return an unfitted LightGBM model: regressor for numeric targets, classifier otherwise."""
        if is_numeric_dtype(target):
            model = lgb.LGBMRegressor()
        else:
            model = lgb.LGBMClassifier()
        return model

    def compute_loss(self, original_df, filled_df):
        """Computes the difference between the original and filled DataFrames.

        The mean squared error is taken over the full frames as passed in,
        not only over the cells that were originally missing.
        """
        return mean_squared_error(original_df, filled_df)

    def impute_initial_mean_or_mode(self, df):
        """Return a copy of ``df`` with NaNs replaced by the column mean or mode.

        Numeric columns are filled with their mean, all other columns with
        their most frequent value. The input frame is not modified.
        """
        df_new = df.copy()
        for column in df:
            series = df_new[column]
            if is_numeric_dtype(series):
                fill_value = series.mean()
            else:
                modes = series.mode()
                if modes.empty:
                    # Column holds no observed value at all: nothing to fill with.
                    continue
                # mode() returns a Series; passing it to fillna() would align
                # on the index and fill almost nothing, so take the scalar.
                fill_value = modes.iloc[0]
            df_new[column] = series.fillna(fill_value)
        return df_new

    def transform(self, df, columns_missing=None, nan_ids=None, iter_id=0):
        """Run one imputation pass; must be implemented by subclasses.

        Parameters
        ----------
        df : pandas.DataFrame
            Current imputed data (no NaNs).
        columns_missing : pandas.Series, optional
            NaN count per column that had missing values.
        nan_ids : list, optional
            Positional ``[row, column]`` pairs of the originally missing cells.
        iter_id : int, default 0
            Zero-based index of the current pass.

        Returns
        -------
        pandas.DataFrame
            The frame after the pass.
        """
        raise NotImplementedError("Subclasses of BaseMICE must implement transform().")

In [4]:
class VanilaMICE(BaseMICE):
    """MICE implementation using value by value imputation.

    For every originally missing cell a fresh model is fitted on all other
    rows and used to predict that single cell.
    """

    method_name = "Vanila MICE"

    def transform(self, df: pd.DataFrame, columns_missing: list, nan_ids: list, iter_id: int):
        """Re-impute every originally missing cell once, in random order.

        Parameters
        ----------
        df : pandas.DataFrame
            Current imputed data; it is updated in place and returned.
        columns_missing : list
            Not used by this method; accepted so the signature matches the
            call made by ``benchmark``.
        nan_ids : list
            Positional ``[row, column]`` pairs of the cells to impute.
        iter_id : int
            Zero-based pass index, used only for the progress-bar label.

        Returns
        -------
        pandas.DataFrame
            The same ``df`` object with the cells in ``nan_ids`` re-predicted.
        """
        n_rows = len(df)
        random_ids = np.random.permutation(len(nan_ids)).tolist()
        for nan_pos in tqdm(random_ids, desc=f"{self.method_name}: Iter {iter_id + 1} / {self.max_iter}", position=0):
            # Setup data
            row_id, col_id = nan_ids[nan_pos]
            target_column_name = df.columns[col_id]
            X = pd.get_dummies(df.drop(columns=[target_column_name]), drop_first=True)
            y = df[target_column_name]

            # nan_ids holds positions, not index labels, so the held-out row
            # is excluded positionally. This stays correct when the index is
            # not a unique 0..n-1 range (e.g. after a concat).
            keep_rows = np.ones(n_rows, dtype=bool)
            keep_rows[row_id] = False

            # Fit model
            model = self.get_model(y).fit(X.iloc[keep_rows], y.iloc[keep_rows])

            # Predict value: predict() returns a length-1 array, store the scalar.
            df.iloc[row_id, col_id] = model.predict(X.iloc[[row_id]])[0]
        return df

In [47]:
# Test-set variants this cell knows how to pair with a training file:
#   'vals_test_df', 'vals_test_df_test_type1', 'vals_test_df_test_type2'
dataNames = ['vals_test_df_test_type2']

# Training CSV that belongs to each test CSV.
TRAIN_FILE_BY_TEST_NAME = {
    'vals_test_df': 'vals_train_df.csv',
    'vals_test_df_test_type1': 'vals_train_df_test_type1.csv',
    'vals_test_df_test_type2': 'vals_train_df_test_type2.csv',
}

# File column 0 is used as the target and columns 1..9 as features; the
# frames handed to the benchmark are ordered features first, target last.
feature_columns = [*range(1, 10)]
target_columns = [0]
features_then_target = feature_columns + target_columns

max_iter = 10
# Created once, outside the loop, so every dataset's results are kept.
results = []

for dataName in dataNames:
    file_name = dataName+".csv"
    # An unknown name raises KeyError instead of silently reusing a stale file name.
    train_dataName = TRAIN_FILE_BY_TEST_NAME[dataName]

    # Complete (no missing values) train + test data, used as ground truth.
    load_test_cataglogue_data = pd.read_csv(file_name, header = None, skiprows=1 )
    load_train_cataglogue_data = pd.read_csv(train_dataName, header = None, skiprows=1 )
    load_cataglogue_data = pd.concat([load_train_cataglogue_data, load_test_cataglogue_data], ignore_index= True)
    print(load_cataglogue_data.shape)
    load_cataglogue_data.to_csv("original_catalogue.csv", index = False)

    # Select columns in memory rather than re-reading the file just written.
    catalogue_features_df = load_cataglogue_data[feature_columns]
    print(catalogue_features_df.shape)
    catalogue_target_df = load_cataglogue_data[target_columns]
    print(catalogue_target_df.shape)

    # setup data
    features_df, targets_df = catalogue_features_df, catalogue_target_df
    original_df = pd.concat([catalogue_features_df,catalogue_target_df], axis=1)

    # Test rows with generated missing values, stacked under the training
    # rows, in the same column order as original_df.
    missing_test_file_name = dataName + "_generated.csv"
    missing_test_df = pd.read_csv(missing_test_file_name, header= None, skiprows = 1)[features_then_target]
    load_train_cataglogue_data = load_train_cataglogue_data[features_then_target]
    nans_df = pd.concat([load_train_cataglogue_data,missing_test_df], ignore_index= True)
    print(nans_df.shape)

    # Compute mean squared error loss
    mean_iters = BaseMICE(max_iter).benchmark_mean_loss(original_df, nans_df, drop_columns_loss=None)
    vanila_iters = VanilaMICE(max_iter).benchmark(original_df, nans_df, drop_columns_loss=None)

    results.append({
        "dataset": dataName,
        "results": {
            "mean": mean_iters,
            "vanila_mice": vanila_iters
        }
    })

Vanila MICE: Iter 1 / 10:   0%|                                                                | 0/666 [00:00<?, ?it/s]

(1311, 10)
(1311, 9)
(1311, 1)
(1311, 10)


Vanila MICE: Iter 1 / 10: 100%|██████████████████████████████████████████████████████| 666/666 [01:55<00:00,  5.75it/s]
Vanila MICE: Iter 2 / 10: 100%|██████████████████████████████████████████████████████| 666/666 [01:56<00:00,  5.71it/s]
Vanila MICE: Iter 3 / 10: 100%|██████████████████████████████████████████████████████| 666/666 [01:57<00:00,  5.65it/s]
Vanila MICE: Iter 4 / 10: 100%|██████████████████████████████████████████████████████| 666/666 [01:57<00:00,  5.65it/s]
Vanila MICE: Iter 5 / 10: 100%|██████████████████████████████████████████████████████| 666/666 [01:57<00:00,  5.66it/s]
Vanila MICE: Iter 6 / 10: 100%|██████████████████████████████████████████████████████| 666/666 [02:01<00:00,  5.47it/s]
Vanila MICE: Iter 7 / 10: 100%|██████████████████████████████████████████████████████| 666/666 [01:56<00:00,  5.73it/s]
Vanila MICE: Iter 8 / 10: 100%|██████████████████████████████████████████████████████| 666/666 [01:57<00:00,  5.69it/s]
Vanila MICE: Iter 9 / 10: 100%|█████████

In [48]:
# Collapse the per-iteration records of every method into one summary row
# (iteration count, rounded total time, mean loss), one table per dataset.
results_by_dataset = {}
summary_columns = ['dataset', 'method', 'num_iters', 'total_time', 'loss']
for dataset in results:
    name = dataset['dataset']

    methods = []
    for method, runs in dataset['results'].items():
        num_iters = len(runs)
        total_time = sum(run["time_seconds"] for run in runs)
        loss = np.mean([run["loss"] for run in runs])
        methods.append([name, method, num_iters, round(total_time), loss])
    results_by_dataset[name] = pd.DataFrame(methods, columns=summary_columns)

print(results_by_dataset)

{'vals_test_df_test_type2':                    dataset       method  num_iters  total_time      loss
0  vals_test_df_test_type2         mean         10           0  0.042994
1  vals_test_df_test_type2  vanila_mice         10        1171  0.010170}


In [49]:
# Which test split the rows of "imputed_df.csv" belong to.
# NOTE(review): this must match the dataset of the benchmark run that wrote
# "imputed_df.csv"; the benchmark cell above lists 'vals_test_df_test_type2'
# in dataNames - confirm before exporting.
dataName = 'vals_test_df'

# First row of the slice exported for each split.
# Presumably the number of training rows stacked above the test rows - TODO confirm.
TEST_START_ROW = {
    'vals_test_df': 918,
    'vals_test_df_test_type1': 495,
    'vals_test_df_test_type2': 816,
}
TOTAL_ROWS = 1311

# The file stores the nine feature columns first and the target last
# (position 9); move the target back to the front.
imputed_raw = pd.read_csv("imputed_df.csv", header= None, skiprows = 1)
imputed_df = imputed_raw[[9, *range(0, 9)]]

# An unknown dataName raises KeyError instead of silently exporting a stale
# MICE_imputed_df left over from an earlier run. Copy so that relabelling the
# columns does not act on a slice of imputed_df.
MICE_imputed_df = imputed_df.iloc[TEST_START_ROW[dataName]:TOTAL_ROWS, :].copy()

MICE_imputed_df.columns = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
MICE_imputed_df.to_csv("MICE_imputated_catalogueData1.csv", index = False)

In [14]:
# Sanity check: build the ground-truth frame and the frame with missing values
# for the 'vals_test_df' split and compare their shapes and first rows.
# Relies on catalogue_features_df / catalogue_target_df from the benchmark cell.
features_df, targets_df = catalogue_features_df, catalogue_target_df
original_df = pd.concat([catalogue_features_df,catalogue_target_df], axis=1)
print(original_df.shape)

# Features (file columns 1..9) first, target (file column 0) last, matching original_df.
features_then_target = [*range(1, 10), 0]
missing_test_df = pd.read_csv("vals_test_df_generated.csv", header= None, skiprows = 1)[features_then_target]
load_train_cataglogue_data = pd.read_csv("vals_train_df.csv", header = None, skiprows=1)[features_then_target]

# ignore_index gives a unique 0..n-1 index; without it the test rows repeat
# the training rows' labels.
nans_df = pd.concat([load_train_cataglogue_data,missing_test_df], ignore_index= True)
print(nans_df.shape)
print(original_df.head(5))
print(nans_df.head(5))
print(catalogue_target_df.shape)

(1311, 10)
(1311, 10)
          1          2          3          4         5         6         7  \
0 -0.073577  12.072797  13.743570  10.600228  4.976427 -2.036147 -1.714266   
1 -0.130407  -0.582912  -0.560864  -0.312900 -0.421307  1.085241  1.102540   
2 -0.100489  -0.265857  -0.325232  -0.180335 -0.423785  1.479124  1.510337   
3 -0.095435  -0.432822  -0.343848  -0.151197 -0.174168  0.280763  0.246145   
4 -0.076036  -0.036002   0.054097  -0.050561  0.157918 -0.322421 -0.384505   

          8         9         0  
0 -1.763495 -1.725079  0.169560  
1  1.155726  1.171365  0.538615  
2  1.341343  1.251914  0.909825  
3  0.182350  0.246956  0.271896  
4 -0.460758 -0.515073  0.237271  
          1          2          3          4         5         6         7  \
0 -0.073577  12.072797  13.743570  10.600228  4.976427 -2.036147 -1.714266   
1 -0.130407  -0.582912  -0.560864  -0.312900 -0.421307  1.085241  1.102540   
2 -0.100489  -0.265857  -0.325232  -0.180335 -0.423785  1.479124  1.510

In [13]:
from time import time
from tqdm import tqdm
import lightgbm as lgb
from pandas.api.types import is_categorical_dtype, is_numeric_dtype
from sklearn.metrics import mean_squared_error

class BaseMICE:
    """Base class for the MICE implementation.

    Provides the shared driver loops (``fill_missing_values`` and
    ``benchmark``), the initial mean/mode imputation and the loss
    computation. Subclasses implement ``transform``, which performs one
    imputation pass over the data.

    Parameters
    ----------
    max_iter : int, default 10
        Number of ``transform`` passes to run.
    previous_loss : float, default 0
        Best loss known before ``benchmark`` starts. ``0`` means "no
        reference loss yet", so the first iteration is always checkpointed.
    """

    def __init__(self, max_iter=10, previous_loss=0):
        self.max_iter = max_iter
        self.previous_loss = previous_loss

    @staticmethod
    def _columns_with_missing(df):
        """Return the per-column NaN counts, restricted to columns that have any."""
        nan_counts = df.isna().sum()
        return nan_counts[nan_counts > 0]

    def _loss_without(self, df_original, df_imputed, drop_columns_loss):
        """Compute the loss after optionally dropping columns from both frames."""
        if drop_columns_loss:
            df_original = df_original.drop(columns=drop_columns_loss)
            df_imputed = df_imputed.drop(columns=drop_columns_loss)
        return self.compute_loss(df_original, df_imputed)

    def fill_missing_values(self, df):
        """Fills the missing values of a pandas DataFrame.

        Parameters
        ----------
        df : pandas.DataFrame
            Input data with missing values (nans).

        Returns
        -------
        pandas.DataFrame
            DataFrame with imputed missing values.
        """
        columns_missing = self._columns_with_missing(df)
        # Positional [row, column] pairs of every NaN cell in the input.
        nan_ids = np.argwhere(df.isna().values).tolist()
        df_imputed = self.impute_initial_mean_or_mode(df)
        for iter_id in range(self.max_iter):
            # Same call shape as benchmark(), so subclasses need one signature.
            df_imputed = self.transform(df_imputed, columns_missing, nan_ids, iter_id)
        return df_imputed

    def benchmark(self, df_original, df_missing, drop_columns_loss=None,
                  output_path="imputed_df.csv"):
        """Benchmarks the fill method for missing values.

        Runs ``max_iter`` imputation passes, measuring the wall-clock time
        and the loss against ``df_original`` after each pass. Whenever a pass
        improves on the best loss seen so far, the imputed frame is written
        to ``output_path``, so the file ends up holding the best iteration.

        Parameters
        ----------
        df_original : pandas.DataFrame
            Original data.
        df_missing : pandas.DataFrame
            Input data with missing values (nans).
        drop_columns_loss : list, optional
            Drop columns in the result DataFrame when
            computing the loss.
        output_path : str or None, default "imputed_df.csv"
            Where to checkpoint the best imputed frame. ``None`` disables
            writing.

        Returns
        -------
        list of dict
            One record per iteration with the keys ``"iter"``,
            ``"time_seconds"`` and ``"loss"``.

        Notes
        -----
        Side effects: sets ``self.df_mean`` to a copy of the initial
        mean/mode imputation and updates ``self.previous_loss`` to the best
        loss observed.
        """
        columns_missing = self._columns_with_missing(df_missing)
        nan_ids = np.argwhere(df_missing.isna().values).tolist()
        df_imputed = self.impute_initial_mean_or_mode(df_missing)
        self.df_mean = df_imputed.copy()

        # The best-loss state lives outside the loop so it carries over
        # between iterations. A zero previous_loss is the "unset" marker;
        # track it separately so a genuinely perfect loss of 0.0 is not
        # mistaken for "unset" later on.
        has_reference_loss = self.previous_loss != 0

        iter_results = []
        for iter_id in range(self.max_iter):
            time_start = time()
            df_imputed = self.transform(df_imputed, columns_missing, nan_ids, iter_id)
            elapsed_seconds = time() - time_start

            loss = self._loss_without(df_original, df_imputed, drop_columns_loss)

            # Lower loss is better: checkpoint only on improvement.
            if not has_reference_loss or loss < self.previous_loss:
                self.previous_loss = loss
                has_reference_loss = True
                if output_path is not None:
                    df_imputed.to_csv(output_path, index = False)

            iter_results.append({
                "iter": iter_id,
                "time_seconds": elapsed_seconds,
                "loss": loss
            })
        return iter_results

    def benchmark_mean_loss(self, df_original, df_missing, drop_columns_loss=None):
        """Builds the baseline records for plain mean/mode imputation.

        The imputation is computed once; its time and loss are repeated
        ``max_iter`` times so the result has the same shape as the list
        returned by ``benchmark``.

        Returns
        -------
        list of dict
            ``max_iter`` records with the keys ``"iter"``, ``"time_seconds"``
            and ``"loss"``.
        """
        time_start = time()
        df_imputed = self.impute_initial_mean_or_mode(df_missing)
        elapsed_seconds = time() - time_start

        # The imputed frame never changes, so the loss is computed only once.
        loss = self._loss_without(df_original, df_imputed, drop_columns_loss)

        return [
            {"iter": iter_id, "time_seconds": elapsed_seconds, "loss": loss}
            for iter_id in range(self.max_iter)
        ]

    def get_model(self, target):
        """Return an unfitted LightGBM model: regressor for numeric targets, classifier otherwise."""
        if is_numeric_dtype(target):
            model = lgb.LGBMRegressor()
        else:
            model = lgb.LGBMClassifier()
        return model

    def compute_loss(self, original_df, filled_df):
        """Computes the difference between the original and filled DataFrames.

        The mean squared error is taken over the full frames as passed in,
        not only over the cells that were originally missing.
        """
        return mean_squared_error(original_df, filled_df)

    def impute_initial_mean_or_mode(self, df):
        """Return a copy of ``df`` with NaNs replaced by the column mean or mode.

        Numeric columns are filled with their mean, all other columns with
        their most frequent value. The input frame is not modified.
        """
        df_new = df.copy()
        for column in df:
            series = df_new[column]
            if is_numeric_dtype(series):
                fill_value = series.mean()
            else:
                modes = series.mode()
                if modes.empty:
                    # Column holds no observed value at all: nothing to fill with.
                    continue
                # mode() returns a Series; passing it to fillna() would align
                # on the index and fill almost nothing, so take the scalar.
                fill_value = modes.iloc[0]
            df_new[column] = series.fillna(fill_value)
        return df_new

    def transform(self, df, columns_missing=None, nan_ids=None, iter_id=0):
        """Run one imputation pass; must be implemented by subclasses.

        Parameters
        ----------
        df : pandas.DataFrame
            Current imputed data (no NaNs).
        columns_missing : pandas.Series, optional
            NaN count per column that had missing values.
        nan_ids : list, optional
            Positional ``[row, column]`` pairs of the originally missing cells.
        iter_id : int, default 0
            Zero-based index of the current pass.

        Returns
        -------
        pandas.DataFrame
            The frame after the pass.
        """
        raise NotImplementedError("Subclasses of BaseMICE must implement transform().")
    

class VanilaMICE(BaseMICE):
    """MICE implementation using value by value imputation.

    For every originally missing cell a fresh model is fitted on all other
    rows and used to predict that single cell.
    """

    method_name = "Vanila MICE"

    def transform(self, df: pd.DataFrame, columns_missing: list, nan_ids: list, iter_id: int):
        """Re-impute every originally missing cell once, in random order.

        Parameters
        ----------
        df : pandas.DataFrame
            Current imputed data; it is updated in place and returned.
        columns_missing : list
            Not used by this method; accepted so the signature matches the
            call made by ``benchmark``.
        nan_ids : list
            Positional ``[row, column]`` pairs of the cells to impute.
        iter_id : int
            Zero-based pass index, used only for the progress-bar label.

        Returns
        -------
        pandas.DataFrame
            The same ``df`` object with the cells in ``nan_ids`` re-predicted.
        """
        n_rows = len(df)
        random_ids = np.random.permutation(len(nan_ids)).tolist()
        for nan_pos in tqdm(random_ids, desc=f"{self.method_name}: Iter {iter_id + 1} / {self.max_iter}", position=0):
            # Setup data
            row_id, col_id = nan_ids[nan_pos]
            target_column_name = df.columns[col_id]
            X = pd.get_dummies(df.drop(columns=[target_column_name]), drop_first=True)
            y = df[target_column_name]

            # nan_ids holds positions, not index labels, so the held-out row
            # is excluded positionally. This stays correct when the index is
            # not a unique 0..n-1 range (e.g. after a concat).
            keep_rows = np.ones(n_rows, dtype=bool)
            keep_rows[row_id] = False

            # Fit model
            model = self.get_model(y).fit(X.iloc[keep_rows], y.iloc[keep_rows])

            # Predict value: predict() returns a length-1 array, store the scalar.
            df.iloc[row_id, col_id] = model.predict(X.iloc[[row_id]])[0]
        return df
    
    
# Benchmark mean imputation against Vanila MICE on the 'vals_test_df' split.
# Relies on catalogue_features_df / catalogue_target_df from an earlier cell.
max_iter = 5
results = []

# setup data
features_df, targets_df = catalogue_features_df, catalogue_target_df
original_df = pd.concat([catalogue_features_df,catalogue_target_df], axis=1)
print(original_df.shape)

# Features (file columns 1..9) first, target (file column 0) last, matching
# original_df. Each file is parsed once and reordered in memory.
features_then_target = [*range(1, 10), 0]
missing_test_df = pd.read_csv("vals_test_df_generated.csv", header= None, skiprows = 1)[features_then_target]
load_train_cataglogue_data = pd.read_csv("vals_train_df.csv", header = None, skiprows=1)[features_then_target]

# Training rows on top, test rows with missing values below, unique 0..n-1 index.
nans_df = pd.concat([load_train_cataglogue_data,missing_test_df], ignore_index= True)
print(nans_df.shape)

# Compute mean squared error loss
mean_iters = BaseMICE(max_iter).benchmark_mean_loss(original_df, nans_df, drop_columns_loss=None)
vanila_iters = VanilaMICE(max_iter).benchmark(original_df, nans_df, drop_columns_loss=None)

results.append({
    "dataset": "Catalogue",
    "results": {
        "mean": mean_iters,
        "vanila_mice": vanila_iters
    }
})

Vanila MICE: Iter 1 / 5:   0%|                                                                 | 0/686 [00:00<?, ?it/s]

(1311, 10)
(1311, 10)


Vanila MICE: Iter 1 / 5: 100%|███████████████████████████████████████████████████████| 686/686 [03:24<00:00,  3.36it/s]
Vanila MICE: Iter 2 / 5: 100%|███████████████████████████████████████████████████████| 686/686 [03:23<00:00,  3.37it/s]
Vanila MICE: Iter 3 / 5: 100%|███████████████████████████████████████████████████████| 686/686 [03:29<00:00,  3.28it/s]
Vanila MICE: Iter 4 / 5: 100%|███████████████████████████████████████████████████████| 686/686 [03:15<00:00,  3.50it/s]
Vanila MICE: Iter 5 / 5: 100%|███████████████████████████████████████████████████████| 686/686 [03:32<00:00,  3.22it/s]


In [14]:
# Collapse the per-iteration records of every method into one summary row
# (iteration count, rounded total time, mean loss), one table per dataset.
results_by_dataset = {}
summary_columns = ['dataset', 'method', 'num_iters', 'total_time', 'loss']
for dataset in results:
    name = dataset['dataset']

    methods = []
    for method, runs in dataset['results'].items():
        num_iters = len(runs)
        total_time = sum(run["time_seconds"] for run in runs)
        loss = np.mean([run["loss"] for run in runs])
        methods.append([name, method, num_iters, round(total_time), loss])
    results_by_dataset[name] = pd.DataFrame(methods, columns=summary_columns)

In [15]:
# Plain-text dump of the {dataset name: summary DataFrame} mapping built in the previous cell.
print(results_by_dataset)

{'Catalogue':      dataset       method  num_iters  total_time      loss
0  Catalogue         mean          5           0  0.042200
1  Catalogue  vanila_mice          5        1026  0.005109}


In [25]:
# Which test split the rows of "imputed_df.csv" belong to.
# NOTE(review): this must match the dataset of the benchmark run that wrote
# "imputed_df.csv" - confirm before exporting.
dataName = 'vals_test_df'

# First row of the slice exported for each split.
# Presumably the number of training rows stacked above the test rows - TODO confirm.
TEST_START_ROW = {
    'vals_test_df': 918,
    'vals_test_df_test_type1': 495,
    'vals_test_df_test_type2': 816,
}
TOTAL_ROWS = 1311

# The file stores the nine feature columns first and the target last
# (position 9); move the target back to the front.
imputed_raw = pd.read_csv("imputed_df.csv", header= None, skiprows = 1)
imputed_df = imputed_raw[[9, *range(0, 9)]]

# An unknown dataName raises KeyError instead of silently exporting a stale
# MICE_imputed_df left over from an earlier run. Copy so that relabelling the
# columns does not act on a slice of imputed_df.
MICE_imputed_df = imputed_df.iloc[TEST_START_ROW[dataName]:TOTAL_ROWS, :].copy()

MICE_imputed_df.columns = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
MICE_imputed_df.to_csv("MICE_imputated_catalogueData1.csv", index = False)