In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler


In [101]:

# Read the raw table and discard the two identifier-like columns. This frame
# still contains the dataset's own missing values and is kept for the
# missingness summary at the end of the notebook.
data_missing = pd.read_csv('../data/bsc_project_set.csv').drop(columns=['id', 'Unnamed: 0'])

# Only fully observed rows are used for the imputation experiment, so that
# every artificially removed value has a known ground truth.
data_complete = data_missing.dropna()
data = data_complete


# One-hot encode the two categorical columns, then keep a single 0/1
# indicator per category ('sex_M', 'peep_regime_high') by dropping the
# complementary dummies. The indicator columns and the 'mort_28' outcome
# are cast to plain integers.
categorical_columns = ['sex', 'peep_regime']
data = (
    pd.get_dummies(data, columns=categorical_columns, drop_first=False)
    .drop(columns=['sex_F', 'peep_regime_low'])
    .astype({'mort_28': int, 'sex_M': int, 'peep_regime_high': int})
)

# print("Before normalizing")
# data.head()

# After encoding every column is numeric; the continuous subset (everything
# except the three 0/1 columns) is what later has values removed.
numeric_columns = data.columns
continuous_columns = numeric_columns.difference(['mort_28', 'sex_M', 'peep_regime_high'])

# Rescale with MinMaxScaler, fitted on all columns (the 0/1 columns included).
# (StandardScaler / Normalizer restricted to continuous_columns are possible
# alternatives and are intentionally not used here.)
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(data[numeric_columns])
data[numeric_columns] = scaled_values


# print("After normalization")
# data.head()

# Randomly remove values from the complete dataset
def randomly_remove_values(df, fraction, columns, random_state=None):
    """Return a copy of ``df`` with a random share of values replaced by NaN.

    For each column in ``columns`` a fresh sample of ``fraction`` of the rows
    is drawn and that column is set to NaN on the sampled rows. ``df`` itself
    is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is copied before any value is removed.
    fraction : float
        Share of rows to blank per column (forwarded to
        ``DataFrame.sample(frac=...)``).
    columns : iterable
        Column labels in which values are removed.
    random_state : None, int or numpy.random.RandomState, optional
        Seed (or ready-made generator) that makes the removal reproducible.
        With ``None`` (the default) the sampling is left to pandas, which
        draws from NumPy's global random state exactly as before this
        parameter existed.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` containing the injected NaN values.

    Notes
    -----
    Rows are addressed by index label, so ``df`` should have unique labels;
    with duplicated labels every row sharing a sampled label is blanked.
    """
    # Build ONE generator shared by all columns. Passing the raw seed to each
    # ``sample`` call would restart the sequence every time and blank the very
    # same rows in every column.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    df_nan = df.copy()
    for col in columns:
        sampled_rows = df_nan.sample(frac=fraction, random_state=rng).index
        df_nan.loc[sampled_rows, col] = np.nan
    return df_nan

# Create a dataset with randomly removed values.
# Seed NumPy's global generator first: DataFrame.sample draws from it when no
# random_state is supplied, so without a seed every run blanks different cells
# and the reported imputation errors cannot be reproduced.
np.random.seed(768)  # same seed value as used for the IterativeImputer
fraction_missing = 0.07  # 7% missing values
data_incomplete = randomly_remove_values(data, fraction_missing, continuous_columns)

# data_incomplete


In [102]:
# Impute using Iterative Imputer
iterative_imputer = IterativeImputer(max_iter=10, random_state=768)
# fit_transform returns a bare array. Re-attach both the column names and the
# row labels of data_incomplete: those labels come from the frame left after
# dropna(), and without index= the result would get a fresh 0..n-1 index and
# no longer line up label-wise with `data`.
df_iterative_imputed = pd.DataFrame(
    iterative_imputer.fit_transform(data_incomplete),
    columns=data_incomplete.columns,
    index=data_incomplete.index,
)

In [103]:
# Impute using KNN Imputer
knn_imputer = KNNImputer(n_neighbors=11, weights='uniform')
# fit_transform returns a bare array. Re-attach both the column names and the
# row labels of data_incomplete: those labels come from the frame left after
# dropna(), and without index= the result would get a fresh 0..n-1 index and
# no longer line up label-wise with `data`.
df_knn_imputed = pd.DataFrame(
    knn_imputer.fit_transform(data_incomplete),
    columns=data_incomplete.columns,
    index=data_incomplete.index,
)

In [104]:
# Function to calculate the imputation error
def calculate_imputation_error(original, imputed, mask):
    """Mean squared error between ``original`` and ``imputed`` on masked cells.

    Only the cells flagged ``True`` in ``mask`` contribute to the score. The
    three inputs are compared by position, so row/column labels are ignored.

    Indexing a DataFrame directly with a 2-D boolean array selects whole rows
    rather than single cells, which would average the error over values that
    were never imputed. The inputs are therefore converted to NumPy arrays
    before the mask is applied.

    Parameters
    ----------
    original : array-like (e.g. pandas.DataFrame)
        Ground-truth values.
    imputed : array-like of the same shape as ``original``
        Values after imputation.
    mask : boolean array-like of the same shape as ``original``
        ``True`` where a value had been removed and was imputed.

    Returns
    -------
    float
        Mean of the squared differences over the masked cells.

    Raises
    ------
    ValueError
        If the shapes differ or if ``mask`` selects no cell at all.
    """
    original_values = np.asarray(original, dtype=float)
    imputed_values = np.asarray(imputed, dtype=float)
    cell_mask = np.asarray(mask, dtype=bool)

    if not (original_values.shape == imputed_values.shape == cell_mask.shape):
        raise ValueError(
            "original, imputed and mask must have the same shape, got "
            f"{original_values.shape}, {imputed_values.shape} and {cell_mask.shape}"
        )
    if not cell_mask.any():
        raise ValueError("mask selects no cells; there is nothing to score")

    # Boolean indexing on ndarrays yields exactly the flagged cells (row-major),
    # in the same order for both arrays.
    residuals = original_values[cell_mask] - imputed_values[cell_mask]
    return float(np.mean(residuals ** 2))

# Boolean frame that is True wherever data_incomplete holds NaN. Because the
# source frame had all incomplete rows dropped, these are exactly the cells
# blanked by the random removal step.
mask = data_incomplete.isnull()

# mask.head()
# data[mask].head()



# Impute missing data
# imputer = SimpleImputer(strategy='mean')
# imputer = KNNImputer(n_neighbors=5, weights='uniform')
# imputer = IterativeImputer(random_state=768)

# data[numeric_columns] = imputer.fit_transform(data[numeric_columns])

In [105]:
# Positions of the removed values as a plain boolean array, shared by both
# evaluations below.
removed_cells = mask.values

# Score the KNN-imputed frame against the fully observed data.
knn_error = calculate_imputation_error(data, df_knn_imputed, removed_cells)
print(f'KNN Imputer MSE: {knn_error}')

# Score the iteratively imputed frame the same way.
iterative_error = calculate_imputation_error(data, df_iterative_imputed, removed_cells)
print(f'Iterative Imputer MSE: {iterative_error}')


In [95]:
# df_knn_imputed

In [8]:
# df_iterative_imputed

In [9]:
# data

In [10]:
# print(np.sum(data_missing.isna(), axis=0))
# missing_total = 0
# not_missing_total = 0
# for col in data_missing.columns:
    
# Missingness summary of the loaded table (identifier columns removed,
# incomplete rows still present).
n_rows, n_cols = data_missing.shape
total_data = n_rows * n_cols  # number of cells in the table

# NaN count per column, then summed across columns.
missing_values = data_missing.isna().sum(axis=0).sum()

# Share of cells that are missing, in percent.
percentage_missing = (missing_values / total_data) * 100

print(
    f"Total amount of data: {total_data}",
    f"Number of missing values: {missing_values}",
    f"Percentage of missing data: {percentage_missing:.2f}%",
    sep="\n",
)


