In [1]:
import pandas as pd 
import numpy as np
import miceforest as mf
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split 

In [2]:
data = pd.read_csv("Institutional_data_full.csv")
data.replace("..", np.nan, inplace=True)
missing_data_percentage = (data.isna().mean() * 100).round(2)
print(missing_data_percentage)

Country                     0.00
Year                        0.00
Control_of_Corruption       4.01
Government_Effectiveness    4.48
Political_Stability         3.31
Regulatory_Quality          4.44
Rule_of_Law                 2.26
Voice_and_Accountability    3.13
dtype: float64


As we have very less missing vlaues in all the columns, we will just drop these from the columns and use that as our original dataset and then create the synthetic dataset from that dataset.

In [3]:
base_data = data[['Control_of_Corruption','Government_Effectiveness','Political_Stability','Regulatory_Quality',
                'Rule_of_Law','Voice_and_Accountability']].dropna()

base_data.to_csv("synthetic_data_institution_full.csv",index = False)

np.random.seed(76)
remove_percentage = 0.3

total_values = base_data.shape[0]
num_values_to_remove = int(remove_percentage * total_values)

nan_indices_Control_of_Corruption = np.random.choice(base_data.index, num_values_to_remove, replace=False)
nan_indices_Government_Effectiveness = np.random.choice(base_data.index, num_values_to_remove, replace=False)
nan_indices_Political_Stability = np.random.choice(base_data.index, num_values_to_remove, replace=False)
nan_indices_Regulatory_Quality = np.random.choice(base_data.index, num_values_to_remove, replace=False)
nan_indices_Rule_of_Law = np.random.choice(base_data.index, num_values_to_remove, replace=False)
nan_indices_Voice_and_Accountability = np.random.choice(base_data.index, num_values_to_remove, replace=False)

base_data.loc[nan_indices_Control_of_Corruption, 'Control_of_Corruption'] = np.nan
base_data.loc[nan_indices_Government_Effectiveness, 'Government_Effectiveness'] = np.nan
base_data.loc[nan_indices_Political_Stability, 'Political_Stability'] = np.nan
base_data.loc[nan_indices_Regulatory_Quality, 'Regulatory_Quality'] = np.nan
base_data.loc[nan_indices_Rule_of_Law, 'Rule_of_Law'] = np.nan
base_data.loc[nan_indices_Voice_and_Accountability, 'Voice_and_Accountability'] = np.nan

base_data.to_csv("synthetic_data_institution.csv", index=False)

In [4]:
data_imputation = pd.read_csv("synthetic_data_institution.csv")

kernel = mf.ImputationKernel(
    data_imputation,
    num_datasets = 1,
    mean_match_candidates=0,
    random_state = 28
    )

optimal_params = kernel.tune_parameters(
    dataset=0, 
    use_gbdt=True,
    num_iterations=500,
    random_state=1,
)
kernel.mice(1, variable_parameters=optimal_params)
pd.DataFrame(optimal_params)

Unnamed: 0,Control_of_Corruption,Government_Effectiveness,Political_Stability,Regulatory_Quality,Rule_of_Law,Voice_and_Accountability
boosting,gbdt,gbdt,gbdt,gbdt,gbdt,gbdt
data_sample_strategy,bagging,bagging,bagging,bagging,bagging,bagging
num_iterations,341,312,278,500,378,236
max_depth,5,4,4,5,3,4
num_leaves,23,17,24,19,18,15
min_data_in_leaf,2,6,5,7,54,7
min_sum_hessian_in_leaf,0.01,0.01,0.01,0.01,0.01,0.01
min_gain_to_split,0.0,0.0,0.0,0.0,0.0,0.0
bagging_fraction,0.999136,0.501521,0.307914,0.795465,0.484219,0.989755
feature_fraction_bynode,0.31248,0.299912,0.580104,0.237637,0.776686,0.621771


In [7]:
kernel.mice(30,variable_parameters=optimal_params)
imputed_data = kernel.complete_data(0)

imputed_data.to_csv("synthetic_data_institution_imputed.csv", index=False)

In [9]:
synthetic_data_full = pd.read_csv('synthetic_data_institution_full.csv')
imputed_synthetic_data = pd.read_csv('synthetic_data_institution_imputed.csv')

columns_to_evaluate = ['Control_of_Corruption','Government_Effectiveness','Political_Stability','Regulatory_Quality',
                'Rule_of_Law','Voice_and_Accountability']


r2_results = {}
rmse_results = {}
mae_results = {}

for column in columns_to_evaluate:
    original_values = synthetic_data_full[column]
    imputed_values = imputed_synthetic_data[column]

    missing_indices = original_values.isna()
    original_values_non_missing = original_values[~missing_indices]
    imputed_values_non_missing = imputed_values[~missing_indices]

    r2 = r2_score(original_values_non_missing, imputed_values_non_missing)
    r2_results[column] = round(r2, 2)

    rmse = np.sqrt(mean_squared_error(original_values_non_missing, imputed_values_non_missing))
    rmse_results[column] = round(rmse, 2)

    mae = mean_absolute_error(original_values_non_missing, imputed_values_non_missing)
    mae_results[column] = round(mae, 2)

print("R² Scores:")
print(r2_results)

print("\nRMSE Scores:")
print(rmse_results)

print("\nMAE Scores:")
print(mae_results)

R² Scores:
{'Control_of_Corruption': 0.97, 'Government_Effectiveness': 0.97, 'Political_Stability': 0.91, 'Regulatory_Quality': 0.96, 'Rule_of_Law': 0.98, 'Voice_and_Accountability': 0.92}

RMSE Scores:
{'Control_of_Corruption': 0.16, 'Government_Effectiveness': 0.16, 'Political_Stability': 0.3, 'Regulatory_Quality': 0.19, 'Rule_of_Law': 0.15, 'Voice_and_Accountability': 0.29}

MAE Scores:
{'Control_of_Corruption': 0.07, 'Government_Effectiveness': 0.07, 'Political_Stability': 0.12, 'Regulatory_Quality': 0.08, 'Rule_of_Law': 0.06, 'Voice_and_Accountability': 0.12}


In [7]:
import random

original_data = pd.read_csv('synthetic_data_institution_full.csv')

columns_to_evaluate = ['Control_of_Corruption', 'Government_Effectiveness', 'Political_Stability', 
                       'Regulatory_Quality', 'Rule_of_Law', 'Voice_and_Accountability']

# Initialize dictionaries to store metrics across iterations
r2_results = {col: [] for col in columns_to_evaluate}
rmse_results = {col: [] for col in columns_to_evaluate}
mae_results = {col: [] for col in columns_to_evaluate}

# Parameters for random NaN generation
remove_percentage = 0.3
num_iterations = 30

# Optimal parameters for miceforest
optimal_params = {
    # Define optimal parameters here
}

# Run the evaluation over multiple iterations with random NaN values
for i in range(num_iterations):
    # Create a new copy of the data to introduce NaN values each time
    data = original_data.copy()
    
    # Randomly introduce NaN values in each column for each iteration
    for column in columns_to_evaluate:
        num_missing = int(remove_percentage * data[column].dropna().shape[0])
        nan_indices = random.sample(list(data[column].dropna().index), num_missing)
        data.loc[nan_indices, column] = np.nan
    
    # Initialize the MICE kernel and perform imputation with optimal parameters
    kernel = mf.ImputationKernel(data, num_datasets=1, random_state=34)
    kernel.mice(1, variable_parameters=optimal_params)
    
    # Complete the dataset and save it
    imputed_data = kernel.complete_data(0)
    imputed_data.to_csv("synthetic_data_institution_imputed.csv", index=False)
    
    # Calculate metrics for each column
    for column in columns_to_evaluate:
        original_values = original_data[column]
        imputed_values = imputed_data[column]

        # Identify non-missing indices in the original dataset
        missing_indices = original_values.isna()
        original_values_non_missing = original_values[~missing_indices]
        imputed_values_non_missing = imputed_values[~missing_indices]

        # R² score
        r2 = r2_score(original_values_non_missing, imputed_values_non_missing)
        r2_results[column].append(r2)

        # RMSE
        rmse = np.sqrt(mean_squared_error(original_values_non_missing, imputed_values_non_missing))
        rmse_results[column].append(rmse)

        # MAE
        mae = mean_absolute_error(original_values_non_missing, imputed_values_non_missing)
        mae_results[column].append(mae)

# Calculate mean and standard deviation of metrics across iterations
metrics_summary = {
    "Metric": [],
    "Column": [],
    "Mean Value": [],
    "Standard Deviation": []
}

# Aggregate results for R², RMSE, and MAE
for column in columns_to_evaluate:
    for metric, values_dict in zip(["R²", "RMSE", "MAE"], [r2_results, rmse_results, mae_results]):
        mean_val = np.mean(values_dict[column])
        std_dev = np.std(values_dict[column])
        
        metrics_summary["Metric"].append(metric)
        metrics_summary["Column"].append(column)
        metrics_summary["Mean Value"].append(round(mean_val, 2))
        metrics_summary["Standard Deviation"].append(round(std_dev, 4))

# Convert summary to a DataFrame and display
metrics_summary_df = pd.DataFrame(metrics_summary)
print(metrics_summary_df)


   Metric                    Column  Mean Value  Standard Deviation
0      R²     Control_of_Corruption        0.90              0.0056
1    RMSE     Control_of_Corruption        0.32              0.0087
2     MAE     Control_of_Corruption        0.13              0.0033
3      R²  Government_Effectiveness        0.89              0.0054
4    RMSE  Government_Effectiveness        0.32              0.0084
5     MAE  Government_Effectiveness        0.13              0.0026
6      R²       Political_Stability        0.78              0.0097
7    RMSE       Political_Stability        0.46              0.0105
8     MAE       Political_Stability        0.19              0.0043
9      R²        Regulatory_Quality        0.87              0.0058
10   RMSE        Regulatory_Quality        0.35              0.0082
11    MAE        Regulatory_Quality        0.14              0.0031
12     R²               Rule_of_Law        0.91              0.0054
13   RMSE               Rule_of_Law        0.30 