In [105]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer


In [106]:
baseline = pd.read_csv('dat/baseline_dataset.csv')
df = pd.read_csv('dat/dataset_with_missing_values.csv')

In [107]:
X = df
y = baseline

In [108]:
imputers = {
    'mean': SimpleImputer(strategy='mean'),
    'median': SimpleImputer(strategy='median'),
    'mode': SimpleImputer(strategy='most_frequent'),
    'constant': SimpleImputer(strategy='constant', fill_value=0),
    'KNN': KNNImputer(n_neighbors=5)
    
}

In [109]:
results = {}

In [112]:
for strategy, imputer in imputers.items():
    X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

    X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.25, random_state=42)

    model = XGBRegressor(random_state=42)
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)

    mse = mean_squared_error(y_test.values.flatten(), predictions.flatten())
    r2 = r2_score(y_test.values.flatten(), predictions.flatten())

    accuracy = accuracy_score(y_test.values.flatten().round(), predictions.flatten().round())

    results[strategy] = {'MSE': mse, 'R2': r2, 'Accuracy': accuracy}

In [113]:
print("Comparison")
for strategy, metrics in results.items():
    print(f"{strategy.capitalize()} - MSE: {metrics['MSE']:.4f}, R2: {metrics['R2']:.4f}, Accuracy: {metrics['Accuracy']:.4f}")

Comparison
Mean - MSE: 0.0556, R2: 0.4719, Accuracy: 0.7067
Median - MSE: 0.0555, R2: 0.4721, Accuracy: 0.7400
Mode - MSE: 0.0640, R2: 0.3918, Accuracy: 0.7400
Constant - MSE: 0.0614, R2: 0.4162, Accuracy: 0.7333
Knn - MSE: 0.0470, R2: 0.5532, Accuracy: 0.7667
