In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Build a small demo dataset in which every column has at least one missing
# value, then show two families of treatment: deletion and simple imputation.
np.random.seed(42)  # no random draw happens in this section; kept so global RNG state is unchanged
data = {
    'age': [25, np.nan, 35, 40, np.nan, 30],
    'salary': [50000, 60000, np.nan, 80000, 45000, np.nan],
    'department': ['IT', 'HR', np.nan, 'IT', 'Finance', 'HR'],
    'experience': [2, 5, np.nan, 10, 1, 4]
}
df = pd.DataFrame(data)
print("Original dataset with missing values:")
print(df)
print("\nMissing values info:")
print(df.isnull().sum())

print("\n" + "="*50)
print("METHOD 1: DELETE ROWS AND COLUMNS")
print("="*50)

# Delete rows with any missing values
df_drop_rows = df.dropna()
print("\nAfter dropping rows with missing values:")
print(df_drop_rows)

# Delete columns with any missing values
df_drop_cols = df.dropna(axis=1)
print("\nAfter dropping columns with missing values:")
print(df_drop_cols)

print("\n" + "="*50)
print("METHOD 2: BASIC IMPUTATION")
print("="*50)

# Columns that receive numeric imputation below.
numeric_cols = ['age', 'salary', 'experience']

# NOTE: every fill below assigns the result back instead of calling
# `frame[col].fillna(..., inplace=True)`. The in-place form operates on a
# column selection (chained assignment); pandas 2.x warns about it and with
# Copy-on-Write enabled it leaves the parent DataFrame unchanged.

# Numerical imputation with mean
df_mean = df.copy()
# Passing a Series to fillna fills each column with its own statistic.
df_mean[numeric_cols] = df_mean[numeric_cols].fillna(df_mean[numeric_cols].mean())
print("\nNumerical imputation with MEAN:")
print(df_mean)

# Numerical imputation with median
df_median = df.copy()
df_median[numeric_cols] = df_median[numeric_cols].fillna(df_median[numeric_cols].median())
print("\nNumerical imputation with MEDIAN:")
print(df_median)

# Numerical imputation with constant
df_constant = df.copy()
df_constant[numeric_cols] = df_constant[numeric_cols].fillna(0)
print("\nNumerical imputation with CONSTANT (0):")
print(df_constant)

# Categorical imputation with mode
df_mode = df.copy()
# mode() is computed once; it is empty only when the column has no non-null
# values, in which case the 'Unknown' label is used instead.
department_modes = df_mode['department'].mode()
mode_value = department_modes.iloc[0] if not department_modes.empty else 'Unknown'
df_mode['department'] = df_mode['department'].fillna(mode_value)
print("\nCategorical imputation with MODE:")
print(df_mode[['department']])

# Categorical imputation with 'Unknown' label
df_unknown = df.copy()
df_unknown['department'] = df_unknown['department'].fillna('Unknown')
print("\nCategorical imputation with 'Unknown' label:")
print(df_unknown[['department']])

# Section banner for the model-based imputers.
print(f"\n{'=' * 50}")
print("METHOD 3: ADVANCED IMPUTATION")
print('=' * 50)

# --- KNN imputation ---------------------------------------------------------
print("\nKNN Imputation:")
df_knn = df.copy()
# The imputer needs an all-numeric matrix, so 'department' is one-hot encoded;
# dummy_na=True adds an indicator column for the missing department.
df_encoded = pd.get_dummies(
    df_knn,
    columns=['department'],
    dummy_na=True,
)
knn_imputer = KNNImputer(n_neighbors=2)
knn_filled = knn_imputer.fit_transform(df_encoded)
# fit_transform returns a bare array; re-attach the encoded column names.
df_knn_imputed = pd.DataFrame(knn_filled, columns=df_encoded.columns)
print(df_knn_imputed.round(2))

# --- MICE (Multivariate Imputation by Chained Equations) --------------------
print("\nMICE Imputation:")
df_mice = df.copy()
df_mice_encoded = pd.get_dummies(
    df_mice,
    columns=['department'],
    dummy_na=True,
)
mice_imputer = IterativeImputer(random_state=42)
mice_filled = mice_imputer.fit_transform(df_mice_encoded)
df_mice_imputed = pd.DataFrame(mice_filled, columns=df_mice_encoded.columns)
print(df_mice_imputed.round(2))

# Final report: compare the frame shapes left by the deletion strategies
# against the untouched original.
print(f"\n{'=' * 50}")
print("COMPARISON SUMMARY")
print('=' * 50)
# One print call, one line per entry (sep="\n").
print(
    f"Original dataset shape: {df.shape}",
    f"After dropping rows: {df_drop_rows.shape}",
    f"After dropping columns: {df_drop_cols.shape}",
    "Imputation methods preserve original dataset shape!",
    sep="\n",
)

ModuleNotFoundError: No module named 'sklearn'