In [2]:
import pandas as pd

data = pd.read_csv("Economic_data.csv")

missing_data_percentage = (data.isna().mean() * 100).round(2)

print(missing_data_percentage)

Country                          0.00
Year                             0.00
GDP_Growth_Rate                  2.82
Inflation_Rate                  21.83
GDP_Per_Capita                   4.80
Unemployment_Rate               41.29
Cost of Living Index            87.58
Local Purchasing Power Index    87.58
dtype: float64


As we can observe from the above table that the percentage of missing data for GDP_Growth_Rate and GDP_Per_Capita 
is the lowest so we will be taking these two columns as our base data. Then we will be dropping the NA values from these columns. Furthermore, we will delete some of the values at rendom from these columns to test our imputation techniques. We will sort of create the synthetic data and then impute on that to compare it with out original data.

#### Creating the synthetic data

In [7]:
import numpy as np
base_data = data[['GDP_Growth_Rate', 'GDP_Per_Capita']].dropna()

np.random.seed(76)
remove_percentage = 0.3
total_values_gdp_growth = base_data['GDP_Growth_Rate'].shape[0]
total_values_gdp_per_capita = base_data['GDP_Per_Capita'].shape[0]

num_gdp_growth = int(remove_percentage * total_values_gdp_growth)
num_gdp_per_capita = int(remove_percentage * total_values_gdp_per_capita)

gdp_growth_nan_indices = np.random.choice(base_data.index, num_gdp_growth, replace=False)
gdp_per_capita_nan_indices = np.random.choice(base_data.index, num_gdp_per_capita, replace=False)

base_data.loc[gdp_growth_nan_indices, 'GDP_Growth_Rate'] = np.nan
base_data.loc[gdp_per_capita_nan_indices, 'GDP_Per_Capita'] = np.nan

base_data.to_csv("synthetic_data.csv",index=False)

In [9]:
syn_test = data[['GDP_Growth_Rate', 'GDP_Per_Capita']].dropna()
syn_test.to_csv("synthetic_data_full.csv",index=False)

#### Creating a data_imputation model for synthetic data

In [13]:
data = pd.read_csv("synthetic_data.csv")

import miceforest as mf
import matplotlib.pyplot as plt

kernel = mf.ImputationKernel(
    data,
    num_datasets = 1, 
    mean_match_candidates=0,
    random_state = 34
    )

kernel.mice(30)
imputed_data = kernel.complete_data(0)

imputed_data.to_csv('imputed_synthetic_data.csv', index=False)

In [16]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

synthetic_data_full = pd.read_csv('synthetic_data_full.csv')
imputed_synthetic_data = pd.read_csv('imputed_synthetic_data.csv')

columns_to_evaluate = ['GDP_Growth_Rate', 'GDP_Per_Capita']

r2_results = {}
rmse_results = {}
mae_results = {}

for column in columns_to_evaluate:
    original_values = synthetic_data_full[column]
    imputed_values = imputed_synthetic_data[column]

    missing_indices = original_values.isna()
    original_values_non_missing = original_values[~missing_indices]
    imputed_values_non_missing = imputed_values[~missing_indices]

    r2 = r2_score(original_values_non_missing, imputed_values_non_missing)
    r2_results[column] = round(r2, 2)

    rmse = np.sqrt(mean_squared_error(original_values_non_missing, imputed_values_non_missing))
    rmse_results[column] = round(rmse, 2)

    mae = mean_absolute_error(original_values_non_missing, imputed_values_non_missing)
    mae_results[column] = round(mae, 2)

print("R² Scores:")
print(r2_results)

print("\nRMSE Scores:")
print(rmse_results)

print("\nMAE Scores:")
print(mae_results)

R² Scores:
{'GDP_Growth_Rate': 0.73, 'GDP_Per_Capita': 0.66}

RMSE Scores:
{'GDP_Growth_Rate': 3.37, 'GDP_Per_Capita': 10071.25}

MAE Scores:
{'GDP_Growth_Rate': 1.12, 'GDP_Per_Capita': 3883.81}


In [2]:
import pandas as pd
import numpy as np
import random
import miceforest as mf
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Load full synthetic dataset for initial base
original_data = pd.read_csv("synthetic_data.csv")
synthetic_data_full = pd.read_csv("synthetic_data_full.csv")

# Specify columns to evaluate
columns_to_evaluate = ['GDP_Growth_Rate', 'GDP_Per_Capita']

# Dictionaries to store metrics across iterations
r2_all_iterations = {col: [] for col in columns_to_evaluate}
rmse_all_iterations = {col: [] for col in columns_to_evaluate}
mae_all_iterations = {col: [] for col in columns_to_evaluate}

# Parameters for random NaN generation
remove_percentage = 0.3
total_values_gdp_growth = original_data['GDP_Growth_Rate'].shape[0]
total_values_gdp_per_capita = original_data['GDP_Per_Capita'].shape[0]
num_gdp_growth = int(remove_percentage * total_values_gdp_growth)
num_gdp_per_capita = int(remove_percentage * total_values_gdp_per_capita)

# Run MICE imputation for 30 iterations with random NaN values
for i in range(30):
    # Create a copy of the data to introduce NaN values
    data = original_data.copy()
    
    # Randomly select indices to set as NaN for each iteration
    gdp_growth_nan_indices = random.sample(list(data.index), num_gdp_growth)
    gdp_per_capita_nan_indices = random.sample(list(data.index), num_gdp_per_capita)
    data.loc[gdp_growth_nan_indices, 'GDP_Growth_Rate'] = np.nan
    data.loc[gdp_per_capita_nan_indices, 'GDP_Per_Capita'] = np.nan
    
    # Initialize the kernel and perform MICE for 1 iteration
    kernel = mf.ImputationKernel(data, num_datasets=1, mean_match_candidates=0, random_state=34)
    kernel.mice(1)
    
    # Get the completed dataset
    imputed_data = kernel.complete_data(0)
    
    # Calculate metrics for each specified column
    for column in columns_to_evaluate:
        original_values = synthetic_data_full[column]
        imputed_values = imputed_data[column]

        # Get non-missing values
        missing_indices = original_values.isna()
        original_values_non_missing = original_values[~missing_indices]
        imputed_values_non_missing = imputed_values[~missing_indices]

        # R² score
        r2 = r2_score(original_values_non_missing, imputed_values_non_missing)
        r2_all_iterations[column].append(r2)

        # RMSE
        rmse = np.sqrt(mean_squared_error(original_values_non_missing, imputed_values_non_missing))
        rmse_all_iterations[column].append(rmse)

        # MAE
        mae = mean_absolute_error(original_values_non_missing, imputed_values_non_missing)
        mae_all_iterations[column].append(mae)

# Calculate mean and standard deviation of metrics across iterations
metrics_summary = {
    "Metric": [],
    "Column": [],
    "Mean Value": [],
    "Standard Deviation": []
}

# Aggregating results for R², RMSE, and MAE
for column in columns_to_evaluate:
    for metric, values_dict in zip(["R²", "RMSE", "MAE"], [r2_all_iterations, rmse_all_iterations, mae_all_iterations]):
        mean_val = np.mean(values_dict[column])
        std_dev = np.std(values_dict[column])
        
        metrics_summary["Metric"].append(metric)
        metrics_summary["Column"].append(column)
        metrics_summary["Mean Value"].append(round(mean_val, 2))
        metrics_summary["Standard Deviation"].append(round(std_dev, 2))

# Convert summary to a DataFrame and display
metrics_summary_df = pd.DataFrame(metrics_summary)
print(metrics_summary_df)


  Metric           Column  Mean Value  Standard Deviation
0     R²  GDP_Growth_Rate        0.51                0.04
1   RMSE  GDP_Growth_Rate        4.51                0.17
2    MAE  GDP_Growth_Rate        1.92                0.03
3     R²   GDP_Per_Capita        0.51                0.03
4   RMSE   GDP_Per_Capita    12004.46              355.89
5    MAE   GDP_Per_Capita     5408.87               56.89
