In [23]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout, Concatenate
from tensorflow.keras.models import Model
from sklearn.preprocessing import MinMaxScaler

def gain_imputation(df, niter=10000, batch_size=128, alpha=100, sample_interval=1000):
    # Separate the columns with missing values and non-missing values
    cols_with_missing = df.columns[df.isna().any()].tolist()
    cols_without_missing = df.columns.difference(cols_with_missing).tolist()
    # print(cols_with_missing)
    # print('-----------',cols_without_missing)
    # Min-max scale the data
    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(df)

    # Define the generator network
    def build_generator():
        generator_input = Input(shape=(len(cols_without_missing),))
        x = Dense(256, activation='relu')(generator_input)
        x = Dropout(0.2)(x)
        x = Dense(512, activation='relu')(x)
        x = Dropout(0.2)(x)
        x = Dense(len(cols_with_missing), activation='sigmoid')(x)
        generator_output = Concatenate()([generator_input, x])
        generator = Model(generator_input, generator_output)
        return generator

    # Define the discriminator network
    def build_discriminator():
        discriminator_input = Input(shape=(len(df.columns),))
        x = Dense(256, activation='relu')(discriminator_input)
        x = Dropout(0.2)(x)
        x = Dense(512, activation='relu')(x)
        x = Dropout(0.2)(x)
        discriminator_output = Dense(1, activation='sigmoid')(x)
        discriminator = Model(discriminator_input, discriminator_output)
        return discriminator

    # Compile the generator and discriminator networks
    generator = build_generator()
    discriminator = build_discriminator()
    discriminator.compile(loss='binary_crossentropy', optimizer='adam')
    gan_input = Input(shape=(len(cols_without_missing),))
    gan_output = discriminator(generator(gan_input))
    gan = Model(gan_input, gan_output)
    gan.compile(loss='binary_crossentropy', optimizer='adam')

    # Train the GAIN model
    for i in range(niter):
        # Sample a batch of complete data
        idx = np.random.choice(len(scaled_data), size=batch_size)
        # real_data = scaled_data[idx]
        # Generate real data with the same shape as the discriminator's input
        real_data = np.random.normal(0, 1, size=(batch_size, 13))


        # Sample a batch of incomplete data
        mask = np.random.binomial(1, 1 - alpha, real_data.shape)
        missing_data = real_data.copy()
        missing_data[mask == 0] = np.nan

        # Impute the missing data using the generator network
        imputed_data = generator.predict([real_data[:, cols_without_missing]])

        # Combine the imputed data with the non-missing data
        imputed_data = np.concatenate([real_data[:, cols_without_missing], imputed_data], axis=1)
        # real_data = np.concatenate([real_data[:, cols_without_missing + cols_with_missing], real_data[:, cols_without_missing + cols_with_missing]], axis=1)
        # real_data = np.concatenate([real_data[:, cols_with_missing], real_data[:, cols_with_missing]], axis=1)
        # Convert column names to indices
        real_data_cols = df.columns.tolist()
        cols_with_missing_idx = np.where(np.in1d(real_data_cols, cols_with_missing))[0]
        cols_without_missing_idx = np.where(np.in1d(real_data_cols, cols_without_missing))[0]
        # cols_with_missing_idx = [real_data.columns.get_loc(col) for col in cols_with_missing]

        # Concatenate the columns with missing values
        # real_data_missing = real_data.iloc[:, cols_with_missing_idx]
        real_data_missing = real_data[:, cols_with_missing_idx]
        real_data = np.concatenate([real_data, real_data_missing], axis=1)

        # missing_data = np.concatenate([missing_data[:, cols_without_missing + cols_with_missing], imputed_data[:, cols_without_missing + cols_with_missing]], axis=1)
        missing_data = np.concatenate([missing_data[:, np.concatenate([cols_without_missing_idx, cols_with_missing_idx])], 
                               imputed_data[:, np.concatenate([cols_without_missing_idx, cols_with_missing_idx])]], 
                               axis=1)

        # Train the discriminator network
        discriminator_loss = discriminator.train_on_batch(real_data, np.ones((batch_size, 1)))
        discriminator_loss += discriminator.train_on_batch(missing_data, np.zeros((batch_size, 1)))

        # Train the generator network
        generator_loss = gan.train_on_batch(real_data[:, cols_without_missing], np.ones((batch_size, 1)))

        # Print the losses and impute some missing data
        if i % sample_interval == 0:
            print('Iteration %d: Generator Loss = %f, Discriminator Loss = %f' % (i, generator_loss, discriminator_loss))
            imputed_data = generator.predict([scaled_data[:, cols_without_missing]])
            imputed_data = np.concatenate([scaled_data[:, cols_without_missing], imputed_data], axis=1)
            imputed_data = scaler.inverse_transform(imputed_data)
            imputed_data = pd.DataFrame(imputed_data, columns=df.columns)

    return imputed_data


In [27]:
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load the Boston Housing dataset
boston = load_boston()
X, y = boston.data, boston.target

# Convert to Pandas DataFrame and add missing values
df = pd.DataFrame(X, columns=boston.feature_names)
df = df.mask(np.random.random(df.shape) < 0.1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=42)

# Impute missing values using the GAIN function
imputed_data = gain_imputation(X_train, alpha=0.5, niter=10000, batch_size=128, sample_interval=1000)

# Evaluate imputation performance on the test set
imputed_test = imputed_data.loc[X_test.index]
mse = mean_squared_error(X_test.fillna(imputed_test.mean()), imputed_test.fillna(imputed_test.mean()))
print('Imputation MSE:', mse)





    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np

        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_ho



ValueError: in user code:

    File "/home/pn7hao/.local/lib/python3.10/site-packages/keras/engine/training.py", line 1051, in train_function  *
        return step_function(self, iterator)
    File "/home/pn7hao/.local/lib/python3.10/site-packages/keras/engine/training.py", line 1040, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/pn7hao/.local/lib/python3.10/site-packages/keras/engine/training.py", line 1030, in run_step  **
        outputs = model.train_step(data)
    File "/home/pn7hao/.local/lib/python3.10/site-packages/keras/engine/training.py", line 889, in train_step
        y_pred = self(x, training=True)
    File "/home/pn7hao/.local/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/home/pn7hao/.local/lib/python3.10/site-packages/keras/engine/input_spec.py", line 264, in assert_input_compatibility
        raise ValueError(f'Input {input_index} of layer "{layer_name}" is '

    ValueError: Input 0 of layer "model_37" is incompatible with the layer: expected shape=(None, 13), found shape=(128, 26)


In [26]:
X_train.shape

(404, 13)