In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

In [21]:
import warnings

# Silence every warning category for the rest of the session so library
# deprecation notices do not clutter the recorded cell outputs.
# NOTE(review): this is a blanket filter; it also hides warnings that may matter.
warnings.simplefilter("ignore")

In [22]:
# Raw strings keep the Windows backslashes literal. In a normal string literal,
# sequences such as "\m", "\d" and "\c" are invalid escapes (deprecated, and a
# SyntaxWarning on recent Python versions), and a segment starting with "t" or
# "n" would silently turn into a tab or newline.
# DATA_DIR is the single place to change when the data lives elsewhere.
DATA_DIR = r'D:\missing_data\dat'

# Complete dataset and its counterpart with missing values.
full_data = pd.read_csv(DATA_DIR + r'\classification_dataset.csv')
data_with_missing = pd.read_csv(DATA_DIR + r'\classification_dataset_with_missing.csv')

In [29]:
# Plain-text dump of the frame loaded from classification_dataset.csv;
# pandas truncates the middle rows and wraps wide column sets in the printout.
print(full_data)

     feature_1  feature_2  feature_3  feature_4  feature_5  feature_6  \
0    -2.568916  -0.257409  -2.679357   3.864818   2.564998  -0.737556   
1     0.622861   0.534544   0.018283  -0.283382   1.907637  -0.341310   
2    -0.171251  -0.496278   1.613347   2.488069  -1.677966   0.303604   
3    -0.871423  -0.333946   3.368446   0.972153  -0.134388   0.212820   
4     2.346402  -0.699965  -0.203251  -0.256745  -1.974251   0.619663   
..         ...        ...        ...        ...        ...        ...   
995  -2.478330  -1.104726  -0.900796   1.451043  -0.150230  -0.921653   
996   2.192353  -0.444433  -1.439293  -1.022223   2.027416   1.280332   
997   1.497338  -1.148851  -0.787342  -1.108765  -0.492599  -0.553695   
998  -0.173973   0.167813   2.607611   0.628983   3.464411  -0.796327   
999  -1.586110   0.893592  -1.307421   1.731974   1.417739   0.026572   

     feature_7  feature_8  feature_9  feature_10  target  
0    -3.330985  -1.213370  -1.473105   -0.846386       1  
1    

In [30]:
# Plain-text dump of the frame loaded from classification_dataset_with_missing.csv.
# NOTE(review): this cell's execution count is later than the cell that drops
# rows without a target, so the recorded output presumably shows the filtered
# frame (gaps in the row index) -- confirm with Restart Kernel -> Run All.
print(data_with_missing)

     feature_1  feature_2  feature_3  feature_4  feature_5  feature_6  \
1          NaN        NaN        NaN        NaN   1.907637  -0.341310   
2    -0.171251  -0.496278        NaN        NaN        NaN        NaN   
3    -0.871423        NaN        NaN        NaN        NaN        NaN   
5          NaN        NaN        NaN        NaN   2.857559        NaN   
9    -1.220030        NaN        NaN   0.279617        NaN   0.973366   
..         ...        ...        ...        ...        ...        ...   
992        NaN   0.436733  -4.898620   4.508056   3.856748        NaN   
994        NaN  -0.476398        NaN        NaN   0.143255   0.055043   
995  -2.478330  -1.104726        NaN   1.451043        NaN        NaN   
998        NaN        NaN   2.607611        NaN   3.464411        NaN   
999        NaN   0.893592  -1.307421   1.731974   1.417739        NaN   

     feature_7  feature_8  feature_9  feature_10  target  
1     1.206240        NaN        NaN         NaN     1.0  
2    

In [23]:
# Keep only the rows whose label is present; a row without a target can be
# used neither for training nor for scoring. Feature NaNs are left untouched.
data_with_missing = data_with_missing.loc[data_with_missing['target'].notna()]

In [24]:
def evaluate_dataset(data, handle_missing=False, strategy='mean', constant_value=None, use_knn=False,
                     test_size=0.2, random_state=42):
    """Train an XGBoost classifier on ``data`` and return its hold-out accuracy.

    The frame is split into train/test parts, optionally imputed, and a fresh
    ``XGBClassifier`` is fitted on the training part and scored on the test part.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'target'`` column with the labels; every other column
        is used as a feature.
    handle_missing : bool, default False
        When True, feature NaNs are imputed before training. When False the
        features are passed to the classifier unchanged.
    strategy : str, default 'mean'
        ``SimpleImputer`` strategy. Ignored when ``use_knn`` is True.
    constant_value : scalar, optional
        Fill value used only when ``strategy == 'constant'``.
    use_knn : bool, default False
        When True (and ``handle_missing`` is True) a default ``KNNImputer`` is
        used instead of ``SimpleImputer``; it takes precedence over ``strategy``.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Seed shared by the split and the classifier so runs are repeatable.

    Returns
    -------
    float
        Accuracy of the fitted model on the held-out test split.

    Raises
    ------
    ValueError
        If the ``'target'`` column contains missing values.
    """
    X = data.drop(columns=['target'])
    y = data['target']

    # Fail early with a clear message instead of an obscure error from the
    # classifier when unlabeled rows were not removed beforehand.
    if y.isna().any():
        raise ValueError(
            "The 'target' column contains missing values; drop those rows "
            "(e.g. data.dropna(subset=['target'])) before evaluating."
        )

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    if handle_missing:
        if use_knn:
            imputer = KNNImputer()
        elif strategy == 'constant':
            imputer = SimpleImputer(strategy='constant', fill_value=constant_value)
        else:
            imputer = SimpleImputer(strategy=strategy)

        # Fit the imputer on the training split only, then reuse its learned
        # statistics on the test split so no test information leaks into training.
        X_train = imputer.fit_transform(X_train)
        X_test = imputer.transform(X_test)

        # Sanity check: the imputers return numpy arrays that should be NaN-free.
        if np.isnan(X_train).any() or np.isnan(X_test).any():
            print("Warning: NaN values remain after imputation!")

    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=random_state)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [25]:
# Baseline: score the classifier on the frame held in full_data.
# handle_missing is left at its default (False), so no imputer is applied.
print("\nEvaluating full dataset:")
full_accuracy = evaluate_dataset(full_data)
print(f"Accuracy: {full_accuracy:.5f}")


Evaluating full dataset:
Accuracy: 0.87500


In [26]:
# Score each SimpleImputer strategy on the incomplete dataset and collect the
# resulting accuracies keyed by strategy name.
results = {}
strategies = ['mean', 'median', 'most_frequent', 'constant']
for strategy in strategies:
    # Only the 'constant' strategy needs an explicit fill value (0 here);
    # the others rely on evaluate_dataset's defaults.
    if strategy != 'constant':
        header = f"\nEvaluating dataset with missing values (strategy: {strategy}):"
        extra_kwargs = {}
    else:
        header = f"\nEvaluating dataset with missing values (strategy: {strategy}, constant_value=0):"
        extra_kwargs = {'constant_value': 0}

    print(header)
    accuracy = evaluate_dataset(data_with_missing, handle_missing=True, strategy=strategy, **extra_kwargs)
    results[strategy] = accuracy
    print(f"Accuracy: {accuracy:.5f}")


Evaluating dataset with missing values (strategy: mean):
Accuracy: 0.50000

Evaluating dataset with missing values (strategy: median):
Accuracy: 0.50980

Evaluating dataset with missing values (strategy: most_frequent):
Accuracy: 0.51961

Evaluating dataset with missing values (strategy: constant, constant_value=0):
Accuracy: 0.54902


In [27]:
# Same evaluation, but with use_knn=True so evaluate_dataset builds a default
# KNNImputer instead of a SimpleImputer (the strategy argument is not used then).
print("\nEvaluating dataset with missing values (strategy: KNN):")
knn_accuracy = evaluate_dataset(data_with_missing, handle_missing=True, use_knn=True)
# Stored under the 'knn' key alongside the SimpleImputer results.
results['knn'] = knn_accuracy
print(f"Accuracy: {knn_accuracy:.5f}")


Evaluating dataset with missing values (strategy: KNN):
Accuracy: 0.54902


In [28]:
# Recap of every imputation method stored in `results` (name -> accuracy),
# in insertion order. The full-data baseline (full_accuracy) is not part of
# `results`, so it does not appear in this summary.
print("\nSummary of accuracies:")
for strategy, accuracy in results.items():
    print(f"{strategy}: {accuracy:.5f}")


Summary of accuracies:
mean: 0.50000
median: 0.50980
most_frequent: 0.51961
constant: 0.54902
knn: 0.54902
