In [19]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer


In [20]:
baseline = pd.read_csv('D:/missing_data/dat/baseline_dataset.csv')
df = pd.read_csv('D:/missing_data/dat/dataset_with_missing_values.csv')

In [23]:
df = df.drop('Target', axis=1)

In [24]:
baseline = baseline.drop('Target', axis=1)

In [42]:
print(baseline)

    Feature_1  Feature_2  Feature_3  Feature_4  Feature_5
0    0.374540   0.950714   0.731994   0.598658   0.156019
1    0.155995   0.058084   0.866176   0.601115   0.708073
2    0.020584   0.969910   0.832443   0.212339   0.181825
3    0.183405   0.304242   0.524756   0.431945   0.291229
4    0.611853   0.139494   0.292145   0.366362   0.456070
..        ...        ...        ...        ...        ...
95   0.992965   0.073797   0.553854   0.969303   0.523098
96   0.629399   0.695749   0.454541   0.627558   0.584314
97   0.901158   0.045446   0.280963   0.950411   0.890264
98   0.455657   0.620133   0.277381   0.188121   0.463698
99   0.353352   0.583656   0.077735   0.974395   0.986211

[100 rows x 5 columns]


In [43]:
print(df)

    Feature_1  Feature_2  Feature_3  Feature_4  Feature_5
0         NaN        NaN   0.731994        NaN        NaN
1    0.155995   0.058084        NaN   0.601115   0.708073
2    0.020584        NaN        NaN   0.212339   0.181825
3    0.183405   0.304242   0.524756   0.431945   0.291229
4         NaN   0.139494   0.292145   0.366362        NaN
..        ...        ...        ...        ...        ...
95        NaN        NaN   0.553854   0.969303        NaN
96   0.629399   0.695749   0.454541   0.627558   0.584314
97        NaN   0.045446        NaN        NaN        NaN
98        NaN   0.620133   0.277381   0.188121   0.463698
99        NaN   0.583656        NaN   0.974395   0.986211

[100 rows x 5 columns]


In [44]:
X = df
y = baseline

In [45]:
imputers = {
    'mean': SimpleImputer(strategy='mean'),
    'median': SimpleImputer(strategy='median'),
    'mode': SimpleImputer(strategy='most_frequent'),
    'constant': SimpleImputer(strategy='constant', fill_value=0),
    'KNN': KNNImputer(n_neighbors=5)
    
}

In [46]:
results = {}

In [47]:
for strategy, imputer in imputers.items():
    X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

    X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.3, random_state=42)

    model = XGBRegressor(random_state=42)
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)

    mse = mean_squared_error(y_test.values.flatten(), predictions.flatten())
    r2 = r2_score(y_test.values.flatten(), predictions.flatten())

    accuracy = accuracy_score(y_test.values.flatten().round(), predictions.flatten().round())

    results[strategy] = {'MSE': mse, 'R2': r2, 'Accuracy': accuracy}

In [48]:
print("Comparison")
for strategy, metrics in results.items():
    print(f"{strategy.capitalize()} - MSE: {metrics['MSE']:.4f}, R2: {metrics['R2']:.4f}, Accuracy: {metrics['Accuracy']:.4f}")

Comparison
Mean - MSE: 0.0674, R2: 0.0747, Accuracy: 0.6800
Median - MSE: 0.0697, R2: 0.0436, Accuracy: 0.6600
Mode - MSE: 0.0761, R2: -0.0443, Accuracy: 0.6933
Constant - MSE: 0.0742, R2: -0.0190, Accuracy: 0.6800
Knn - MSE: 0.0710, R2: 0.0245, Accuracy: 0.6667
