In [1]:
import pandas as pd 
import numpy as np
import miceforest as mf
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split 

In [2]:
data = pd.read_csv("Sustainability_Data.csv")
missing_data_percentage = (data.isna().mean() * 100).round(2)
print(missing_data_percentage)

Country                                  0.00
Year                                     0.00
Carbon_dioxide_emissions                44.21
Electricity_production_non_renewable    45.30
Electricity production_renewable        45.30
Micro_air_pollution                     55.78
Greenhouse_emission                     43.07
dtype: float64


As we have very less missing vlaues in all the columns, we will just drop these from the columns and use that as our original dataset and then create the synthetic dataset from that dataset.

In [3]:
base_data = data[['Carbon_dioxide_emissions','Electricity_production_non_renewable','Electricity production_renewable',
                'Micro_air_pollution', 'Greenhouse_emission']].dropna()

base_data.to_csv("synthetic_data_sustainability_full.csv",index = False)

np.random.seed(54)
remove_percentage = 0.3

total_values = base_data.shape[0]
num_values_to_remove = int(remove_percentage * total_values)

nan_indices_Carbon_dioxide_emissions = np.random.choice(base_data.index, num_values_to_remove, replace=False)
nan_indices_Electricity_production_non_renewable = np.random.choice(base_data.index, num_values_to_remove, replace=False)
nan_indices_Electricity_production_renewable = np.random.choice(base_data.index, num_values_to_remove, replace=False)
nan_indices_Micro_air_pollution = np.random.choice(base_data.index, num_values_to_remove, replace=False)
nan_indices_Greenhouse_emission = np.random.choice(base_data.index, num_values_to_remove, replace=False)

base_data.loc[nan_indices_Carbon_dioxide_emissions, 'Carbon_dioxide_emissions'] = np.nan
base_data.loc[nan_indices_Electricity_production_non_renewable, 'Electricity_production_non_renewable'] = np.nan
base_data.loc[nan_indices_Electricity_production_renewable, 'Electricity production_renewable'] = np.nan
base_data.loc[nan_indices_Micro_air_pollution, 'Micro_air_pollution'] = np.nan
base_data.loc[nan_indices_Greenhouse_emission, 'Greenhouse_emission'] = np.nan

base_data.to_csv("synthetic_data_sustainability.csv", index=False)

In [4]:
data_imputation = pd.read_csv("synthetic_data_sustainability.csv")

kernel = mf.ImputationKernel(
    data_imputation,
    num_datasets = 1,
    mean_match_candidates=0,
    random_state = 28
    )

optimal_params = kernel.tune_parameters(
    dataset=0, 
    use_gbdt=True,
    num_iterations=500,
    random_state=1,
)
kernel.mice(1, variable_parameters=optimal_params)
pd.DataFrame(optimal_params)

Unnamed: 0,Carbon_dioxide_emissions,Electricity_production_non_renewable,Electricity production_renewable,Micro_air_pollution,Greenhouse_emission
boosting,gbdt,gbdt,gbdt,gbdt,gbdt
data_sample_strategy,bagging,bagging,bagging,bagging,bagging
num_iterations,235,204,166,186,500
max_depth,4,4,4,5,2
num_leaves,5,17,24,18,8
min_data_in_leaf,3,6,4,1,4
min_sum_hessian_in_leaf,0.01,0.01,0.01,0.01,0.01
min_gain_to_split,0.0,0.0,0.0,0.0,0.0
bagging_fraction,0.82067,0.501521,0.307914,0.779917,0.970901
feature_fraction_bynode,0.971435,0.299912,0.580104,0.778489,0.350487


In [6]:
kernel.mice(30,variable_parameters=optimal_params)
imputed_data = kernel.complete_data(0)

imputed_data.to_csv("synthetic_data_sustainability_imputed.csv", index=False)

In [7]:
synthetic_data_full = pd.read_csv('synthetic_data_sustainability_full.csv')
imputed_synthetic_data = pd.read_csv('synthetic_data_sustainability_imputed.csv')

columns_to_evaluate = ['Carbon_dioxide_emissions','Electricity_production_non_renewable','Electricity production_renewable',
                'Micro_air_pollution', 'Greenhouse_emission']


r2_results = {}
rmse_results = {}
mae_results = {}

for column in columns_to_evaluate:
    original_values = synthetic_data_full[column]
    imputed_values = imputed_synthetic_data[column]

    missing_indices = original_values.isna()
    original_values_non_missing = original_values[~missing_indices]
    imputed_values_non_missing = imputed_values[~missing_indices]

    r2 = r2_score(original_values_non_missing, imputed_values_non_missing)
    r2_results[column] = round(r2, 2)

    rmse = np.sqrt(mean_squared_error(original_values_non_missing, imputed_values_non_missing))
    rmse_results[column] = round(rmse, 2)

    mae = mean_absolute_error(original_values_non_missing, imputed_values_non_missing)
    mae_results[column] = round(mae, 2)

print("R² Scores:")
print(r2_results)

print("\nRMSE Scores:")
print(rmse_results)

print("\nMAE Scores:")
print(mae_results)

R² Scores:
{'Carbon_dioxide_emissions': 0.79, 'Electricity_production_non_renewable': 0.75, 'Electricity production_renewable': 0.74, 'Micro_air_pollution': 0.75, 'Greenhouse_emission': 0.82}

RMSE Scores:
{'Carbon_dioxide_emissions': np.float64(58.79), 'Electricity_production_non_renewable': np.float64(16.85), 'Electricity production_renewable': np.float64(3.13), 'Micro_air_pollution': np.float64(8.12), 'Greenhouse_emission': np.float64(24.96)}

MAE Scores:
{'Carbon_dioxide_emissions': np.float64(16.43), 'Electricity_production_non_renewable': np.float64(7.78), 'Electricity production_renewable': np.float64(1.03), 'Micro_air_pollution': np.float64(3.34), 'Greenhouse_emission': np.float64(8.52)}


In [5]:
import random

original_data = pd.read_csv('synthetic_data_sustainability_full.csv')

columns_to_evaluate = ['Carbon_dioxide_emissions','Electricity_production_non_renewable','Electricity production_renewable',
                'Micro_air_pollution', 'Greenhouse_emission']

# Initialize dictionaries to store metrics across iterations
r2_results = {col: [] for col in columns_to_evaluate}
rmse_results = {col: [] for col in columns_to_evaluate}
mae_results = {col: [] for col in columns_to_evaluate}

# Parameters for random NaN generation
remove_percentage = 0.3
num_iterations = 30

# Optimal parameters for miceforest
optimal_params = {
    # Define optimal parameters here
}

# Run the evaluation over multiple iterations with random NaN values
for i in range(num_iterations):
    # Create a new copy of the data to introduce NaN values each time
    data = original_data.copy()
    
    # Randomly introduce NaN values in each column for each iteration
    for column in columns_to_evaluate:
        num_missing = int(remove_percentage * data[column].dropna().shape[0])
        nan_indices = random.sample(list(data[column].dropna().index), num_missing)
        data.loc[nan_indices, column] = np.nan
    
    # Initialize the MICE kernel and perform imputation with optimal parameters
    kernel = mf.ImputationKernel(data, num_datasets=1, random_state=34)
    kernel.mice(1, variable_parameters=optimal_params)
    
    # Complete the dataset and save it
    imputed_data = kernel.complete_data(0)
    imputed_data.to_csv("synthetic_data_sustainability_imputed.csv", index=False)
    
    # Calculate metrics for each column
    for column in columns_to_evaluate:
        original_values = original_data[column]
        imputed_values = imputed_data[column]

        # Identify non-missing indices in the original dataset
        missing_indices = original_values.isna()
        original_values_non_missing = original_values[~missing_indices]
        imputed_values_non_missing = imputed_values[~missing_indices]

        # R² score
        r2 = r2_score(original_values_non_missing, imputed_values_non_missing)
        r2_results[column].append(r2)

        # RMSE
        rmse = np.sqrt(mean_squared_error(original_values_non_missing, imputed_values_non_missing))
        rmse_results[column].append(rmse)

        # MAE
        mae = mean_absolute_error(original_values_non_missing, imputed_values_non_missing)
        mae_results[column].append(mae)

# Calculate mean and standard deviation of metrics across iterations
metrics_summary = {
    "Metric": [],
    "Column": [],
    "Mean Value": [],
    "Standard Deviation": []
}

# Aggregate results for R², RMSE, and MAE
for column in columns_to_evaluate:
    for metric, values_dict in zip(["R²", "RMSE", "MAE"], [r2_results, rmse_results, mae_results]):
        mean_val = np.mean(values_dict[column])
        std_dev = np.std(values_dict[column])
        
        metrics_summary["Metric"].append(metric)
        metrics_summary["Column"].append(column)
        metrics_summary["Mean Value"].append(round(mean_val, 2))
        metrics_summary["Standard Deviation"].append(round(std_dev, 4))

# Convert summary to a DataFrame and display
metrics_summary_df = pd.DataFrame(metrics_summary)
print(metrics_summary_df)


   Metric                                Column  Mean Value  \
0      R²              Carbon_dioxide_emissions        0.68   
1    RMSE              Carbon_dioxide_emissions       72.93   
2     MAE              Carbon_dioxide_emissions       21.65   
3      R²  Electricity_production_non_renewable        0.54   
4    RMSE  Electricity_production_non_renewable       23.18   
5     MAE  Electricity_production_non_renewable        9.91   
6      R²      Electricity production_renewable        0.57   
7    RMSE      Electricity production_renewable        4.04   
8     MAE      Electricity production_renewable        1.15   
9      R²                   Micro_air_pollution        0.58   
10   RMSE                   Micro_air_pollution       10.56   
11    MAE                   Micro_air_pollution        4.19   
12     R²                   Greenhouse_emission        0.72   
13   RMSE                   Greenhouse_emission       30.84   
14    MAE                   Greenhouse_emission       1