In [2]:
import pandas as pd 
import miceforest as mf
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np

In [3]:
data = pd.read_csv("Quality_of_Life_final.csv")

missing_data_percentage = (data.isna().mean() * 100).round(2)

print(missing_data_percentage)

Country                                                   0.00
Year                                                      0.00
LE_at_birth                                               6.83
Doctors_Per_10000                                        67.91
Access_to_Electricity                                    47.84
Carbon_dioxide_emissions_per_capita_production_tonnes    57.84
Gender_Development_Index                                 66.47
Gender_Inequality_Index                                  66.50
Human_Development_Index                                  61.83
Health_Care_Index                                        92.44
Crime_Index                                              88.78
dtype: float64


In [4]:
data_1991 = data[data['Year'] >= 1991]

null_percentage_1991 = (data_1991.isna().mean() * 100).round(2)

print(null_percentage_1991)

Country                                                   0.00
Year                                                      0.00
LE_at_birth                                              10.33
Doctors_Per_10000                                        63.16
Access_to_Electricity                                    18.44
Carbon_dioxide_emissions_per_capita_production_tonnes    35.17
Gender_Development_Index                                 48.03
Gender_Inequality_Index                                  48.27
Human_Development_Index                                  41.00
Health_Care_Index                                        88.01
Crime_Index                                              82.21
dtype: float64


In [5]:
base_data = data_1991[['LE_at_birth', 'Access_to_Electricity', 'Gender_Development_Index','Human_Development_Index','Gender_Inequality_Index']].dropna()

base_data.to_csv("synthetic_data_healthcare_full.csv",index = False)

np.random.seed(76)
remove_percentage = 0.3

total_values = base_data.shape[0]
num_values_to_remove = int(remove_percentage * total_values)

nan_indices_le_at_birth = np.random.choice(base_data.index, num_values_to_remove, replace=False)
nan_indices_access_to_electricity = np.random.choice(base_data.index, num_values_to_remove, replace=False)
nan_indices_gender_dev_index = np.random.choice(base_data.index, num_values_to_remove, replace=False)
nan_indices_human_dev_index = np.random.choice(base_data.index, num_values_to_remove, replace=False)
nan_indices_gender_ineq_index = np.random.choice(base_data.index, num_values_to_remove, replace=False)

base_data.loc[nan_indices_le_at_birth, 'LE_at_birth'] = np.nan
base_data.loc[nan_indices_access_to_electricity, 'Access_to_Electricity'] = np.nan
base_data.loc[nan_indices_gender_dev_index, 'Gender_Development_Index'] = np.nan
base_data.loc[nan_indices_human_dev_index, 'Human_Development_Index'] = np.nan
base_data.loc[nan_indices_gender_ineq_index, 'Gender_Inequality_Index'] = np.nan

base_data.to_csv("synthetic_data_healthcare.csv", index=False)

In [6]:
data_imputation = pd.read_csv("synthetic_data_healthcare.csv")

kernel = mf.ImputationKernel(
    data_imputation,
    num_datasets = 1,
    mean_match_candidates=0,
    random_state = 28
    )

optimal_params = kernel.tune_parameters(
    dataset=0, 
    use_gbdt=True,
    num_iterations=500,
    random_state=1,
)
kernel.mice(1, variable_parameters=optimal_params)
pd.DataFrame(optimal_params)

Unnamed: 0,LE_at_birth,Access_to_Electricity,Gender_Development_Index,Human_Development_Index,Gender_Inequality_Index
boosting,gbdt,gbdt,gbdt,gbdt,gbdt
data_sample_strategy,bagging,bagging,bagging,bagging,bagging
num_iterations,260,301,309,200,500
max_depth,4,4,5,4,3
num_leaves,12,17,11,22,18
min_data_in_leaf,4,6,2,9,48
min_sum_hessian_in_leaf,0.01,0.01,0.01,0.01,0.01
min_gain_to_split,0.0,0.0,0.0,0.0,0.0
bagging_fraction,0.580973,0.501521,0.934758,0.917682,0.484219
feature_fraction_bynode,0.922566,0.299912,0.412989,0.661024,0.776686


In [10]:
kernel.mice(30,variable_parameters=optimal_params)
imputed_data = kernel.complete_data(0)

imputed_data.to_csv("synthetic_data_healthcare_imputed.csv", index=False)

In [13]:
synthetic_data_full = pd.read_csv('synthetic_data_healthcare_full.csv')
imputed_synthetic_data = pd.read_csv('synthetic_data_healthcare_imputed.csv')

columns_to_evaluate = ['LE_at_birth', 'Access_to_Electricity', 'Gender_Development_Index','Human_Development_Index','Gender_Inequality_Index']

r2_results = {}
rmse_results = {}
mae_results = {}

for column in columns_to_evaluate:
    original_values = synthetic_data_full[column]
    imputed_values = imputed_synthetic_data[column]

    missing_indices = original_values.isna()
    original_values_non_missing = original_values[~missing_indices]
    imputed_values_non_missing = imputed_values[~missing_indices]

    r2 = r2_score(original_values_non_missing, imputed_values_non_missing)
    r2_results[column] = round(r2, 2)

    rmse = np.sqrt(mean_squared_error(original_values_non_missing, imputed_values_non_missing))
    rmse_results[column] = round(rmse, 2)

    mae = mean_absolute_error(original_values_non_missing, imputed_values_non_missing)
    mae_results[column] = round(mae, 2)

print("R² Scores:")
print(r2_results)

print("\nRMSE Scores:")
print(rmse_results)

print("\nMAE Scores:")
print(mae_results)

R² Scores:
{'LE_at_birth': 0.95, 'Access_to_Electricity': 0.95, 'Gender_Development_Index': 0.9, 'Human_Development_Index': 0.97, 'Gender_Inequality_Index': 0.94}

RMSE Scores:
{'LE_at_birth': 2.16, 'Access_to_Electricity': 6.82, 'Gender_Development_Index': 0.02, 'Human_Development_Index': 0.03, 'Gender_Inequality_Index': 0.05}

MAE Scores:
{'LE_at_birth': 0.82, 'Access_to_Electricity': 1.89, 'Gender_Development_Index': 0.01, 'Human_Development_Index': 0.01, 'Gender_Inequality_Index': 0.02}


In [7]:
import random

original_data = pd.read_csv('synthetic_data_healthcare_full.csv')

columns_to_evaluate = ['LE_at_birth', 'Access_to_Electricity', 'Gender_Development_Index','Human_Development_Index','Gender_Inequality_Index']

# Initialize dictionaries to store metrics across iterations
r2_results = {col: [] for col in columns_to_evaluate}
rmse_results = {col: [] for col in columns_to_evaluate}
mae_results = {col: [] for col in columns_to_evaluate}

# Parameters for random NaN generation
remove_percentage = 0.3
num_iterations = 30

# Optimal parameters for miceforest
optimal_params = {
    # Define optimal parameters here
}

# Run the evaluation over multiple iterations with random NaN values
for i in range(num_iterations):
    # Create a new copy of the data to introduce NaN values each time
    data = original_data.copy()
    
    # Randomly introduce NaN values in each column for each iteration
    for column in columns_to_evaluate:
        num_missing = int(remove_percentage * data[column].dropna().shape[0])
        nan_indices = random.sample(list(data[column].dropna().index), num_missing)
        data.loc[nan_indices, column] = np.nan
    
    # Initialize the MICE kernel and perform imputation with optimal parameters
    kernel = mf.ImputationKernel(data, num_datasets=1, random_state=34)
    kernel.mice(1, variable_parameters=optimal_params)
    
    # Complete the dataset and save it
    imputed_data = kernel.complete_data(0)
    imputed_data.to_csv("synthetic_data_QOLI_imputed.csv", index=False)
    
    # Calculate metrics for each column
    for column in columns_to_evaluate:
        original_values = original_data[column]
        imputed_values = imputed_data[column]

        # Identify non-missing indices in the original dataset
        missing_indices = original_values.isna()
        original_values_non_missing = original_values[~missing_indices]
        imputed_values_non_missing = imputed_values[~missing_indices]

        # R² score
        r2 = r2_score(original_values_non_missing, imputed_values_non_missing)
        r2_results[column].append(r2)

        # RMSE
        rmse = np.sqrt(mean_squared_error(original_values_non_missing, imputed_values_non_missing))
        rmse_results[column].append(rmse)

        # MAE
        mae = mean_absolute_error(original_values_non_missing, imputed_values_non_missing)
        mae_results[column].append(mae)

# Calculate mean and standard deviation of metrics across iterations
metrics_summary = {
    "Metric": [],
    "Column": [],
    "Mean Value": [],
    "Standard Deviation": []
}

# Aggregate results for R², RMSE, and MAE
for column in columns_to_evaluate:
    for metric, values_dict in zip(["R²", "RMSE", "MAE"], [r2_results, rmse_results, mae_results]):
        mean_val = np.mean(values_dict[column])
        std_dev = np.std(values_dict[column])
        
        metrics_summary["Metric"].append(metric)
        metrics_summary["Column"].append(column)
        metrics_summary["Mean Value"].append(round(mean_val, 2))
        metrics_summary["Standard Deviation"].append(round(std_dev, 4))

# Convert summary to a DataFrame and display
metrics_summary_df = pd.DataFrame(metrics_summary)
print(metrics_summary_df)


   Metric                    Column  Mean Value  Standard Deviation
0      R²               LE_at_birth        0.85              0.0098
1    RMSE               LE_at_birth        3.54              0.1185
2     MAE               LE_at_birth        1.40              0.0435
3      R²     Access_to_Electricity        0.87              0.0103
4    RMSE     Access_to_Electricity       11.40              0.4365
5     MAE     Access_to_Electricity        3.38              0.1456
6      R²  Gender_Development_Index        0.78              0.0125
7    RMSE  Gender_Development_Index        0.03              0.0009
8     MAE  Gender_Development_Index        0.01              0.0004
9      R²   Human_Development_Index        0.89              0.0063
10   RMSE   Human_Development_Index        0.06              0.0015
11    MAE   Human_Development_Index        0.02              0.0006
12     R²   Gender_Inequality_Index        0.85              0.0078
13   RMSE   Gender_Inequality_Index        0.08 

In [None]:
# Load data
synthetic_data = pd.read_csv('synthetic_data_sustainability.csv')
full_data = pd.read_csv('synthetic_data_sustainability_full.csv')

# Configure and fit the imputer
rf_imputer = IterativeImputer(
    estimator=RandomForestRegressor(
        n_estimators=best_params['n_estimators'],
        max_depth=best_params['max_depth'],
        min_samples_split=best_params['min_samples_split'],
        min_samples_leaf=best_params['min_samples_leaf'],
        max_features=best_params['max_features'],
        random_state=29
    ),
    random_state=29
)
synthetic_data_imputed = pd.DataFrame(rf_imputer.fit_transform(synthetic_data), columns=synthetic_data.columns)

# Initialize metrics storage
metrics = {'target': [], 'R^2': [], 'RMSE': [], 'MAE': []}
missing_columns = synthetic_data.columns[synthetic_data.isnull().any()]

# Set up subplots
num_columns = len(missing_columns)
fig, axes = plt.subplots(nrows=num_columns // 2 + num_columns % 2, ncols=2, figsize=(14, num_columns * 3))
axes = axes.flatten()  # Flatten for easy indexing

# Loop through each column with missing values
for idx, column in enumerate(missing_columns):
    y_true = full_data[column]
    y_pred = synthetic_data_imputed[column]

    # Calculate metrics
    r2 = r2_score(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    
    # Append metrics for this target
    metrics['target'].append(column)
    metrics['R^2'].append(r2)
    metrics['RMSE'].append(rmse)
    metrics['MAE'].append(mae)
    
    # Plot distribution comparison in subplots
    sns.kdeplot(y_true, label='Original', color='blue', ax=axes[idx])
    sns.kdeplot(y_pred, label='Imputed', color='red', ax=axes[idx])
    axes[idx].set_title(f"Distribution of Original vs Imputed for {column}")
    axes[idx].set_xlabel(column)
    axes[idx].legend()

# Remove any unused subplots
for j in range(idx + 1, len(axes)):
    fig.delaxes(axes[j])

plt.tight_layout()
plt.show()

# Display metrics as a DataFrame
metrics_df = pd.DataFrame(metrics)
print(metrics_df)