In [6]:
import pandas as pd
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# Build a small demo frame in which every column has at least one missing
# value (NaN), so each imputation strategy below has something to fill.
data = {
    'Age': [25, 30, np.nan, 35, 40, np.nan, 45],
    'Salary': [50000, 60000, 55000, np.nan, 65000, 62000, np.nan],
    'Experience': [2, 5, 3, 7, np.nan, 6, 8]
}

df = pd.DataFrame(data)

print("Original Data:")
print(df)

# Restrict imputation to the numeric columns. The mean and KNN imputers used
# below only work on numeric data, so a text/categorical column added to the
# frame later must not be handed to them. For the frame above this still
# selects all three columns.
num_cols = df.select_dtypes(include="number").columns

# 4. SimpleImputer with mean strategy
# Every NaN is replaced by the mean of the observed values in its own column.
simple_imputer = SimpleImputer(strategy='mean')
# fit_transform returns a plain array by default, so the row labels have to be
# passed back in explicitly. Without index=df.index the result is renumbered
# 0..n-1 and no longer lines up with df when df has a non-default index.
df_simple_imputed = pd.DataFrame(
    simple_imputer.fit_transform(df[num_cols]),
    columns=num_cols,
    index=df.index,
)

print("\nData after SimpleImputer (mean):")
print(df_simple_imputed)

# 5. Regression-based imputation for 'Age'
# Fit a model on the rows where the target is known, then predict the target
# for the rows where it is missing.
target_col = 'Age'

# Compute the mask once and reuse it for splitting and for writing back.
missing_mask = df[target_col].isnull()
train_df = df[~missing_mask]
predict_df = df[missing_mask]

# Work on a copy so df stays untouched. The copy is created before branching
# so df_reg_imputed is always defined, even when there is nothing to impute.
df_reg_imputed = df.copy()

if not predict_df.empty:
    X_train = train_df.drop(columns=[target_col])
    y_train = train_df[target_col]
    X_predict = predict_df.drop(columns=[target_col])

    # NOTE: the predictor columns (Salary, Experience) still contain NaN here.
    # RandomForestRegressor accepts NaN inputs only in scikit-learn 1.4+;
    # older releases reject them at fit time.
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    predicted_values = model.predict(X_predict)

    # predicted_values is in the row order of predict_df, which is the same
    # order in which the mask selects rows, so positional assignment is safe.
    df_reg_imputed.loc[missing_mask, target_col] = predicted_values

    print(f"\nData after regression-based imputation for '{target_col}':")
    print(df_reg_imputed)
else:
    print(f"\nNo missing values in '{target_col}' to impute with regression.")

# 6. KNN Imputer
# Each NaN is filled from the values of that column in the 2 nearest rows.
# NOTE: the columns are not scaled. Salary is in the tens of thousands while
# Age and Experience are one or two digits, so Salary differences dominate the
# distance whenever both rows have a Salary. Scale the features first if the
# neighbours should reflect all columns.
knn_imputer = KNNImputer(n_neighbors=2)
# Keep df's row labels on the result; fit_transform returns a plain array by
# default and would otherwise be renumbered 0..n-1.
df_knn_imputed = pd.DataFrame(
    knn_imputer.fit_transform(df[num_cols]),
    columns=num_cols,
    index=df.index,
)

print("\nData after KNN Imputer:")
print(df_knn_imputed)

Original Data:
    Age   Salary  Experience
0  25.0  50000.0         2.0
1  30.0  60000.0         5.0
2   NaN  55000.0         3.0
3  35.0      NaN         7.0
4  40.0  65000.0         NaN
5   NaN  62000.0         6.0
6  45.0      NaN         8.0

Data after SimpleImputer (mean):
    Age   Salary  Experience
0  25.0  50000.0    2.000000
1  30.0  60000.0    5.000000
2  35.0  55000.0    3.000000
3  35.0  58400.0    7.000000
4  40.0  65000.0    5.166667
5  35.0  62000.0    6.000000
6  45.0  58400.0    8.000000

Data after regression-based imputation for 'Age':
         Age   Salary  Experience
0  25.000000  50000.0         2.0
1  30.000000  60000.0         5.0
2  26.903333  55000.0         3.0
3  35.000000      NaN         7.0
4  40.000000  65000.0         NaN
5  32.328333  62000.0         6.0
6  45.000000      NaN         8.0

Data after KNN Imputer:
    Age   Salary  Experience
0  25.0  50000.0         2.0
1  30.0  60000.0         5.0
2  40.0  55000.0         3.0
3  35.0  61000.0         7.0
4  40.0  65000.0         7.5
5  40.0  62000.0         6.0
6  45.0  58500.0         8.0

In [7]:
import pandas as pd
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# Build a small demo frame in which every column has at least one missing
# value (NaN), so each imputation strategy below has something to fill.
data = {
    'Age': [25, 30, np.nan, 35, 40, np.nan, 45],
    'Salary': [50000, 60000, 55000, np.nan, 65000, 62000, np.nan],
    'Experience': [2, 5, 3, 7, np.nan, 6, 8]
}

df = pd.DataFrame(data)

print("Original Data:")
print(df)

# Restrict imputation to the numeric columns. The mean and KNN imputers used
# below only work on numeric data, so a text/categorical column added to the
# frame later must not be handed to them. For the frame above this still
# selects all three columns.
num_cols = df.select_dtypes(include="number").columns

# 4. SimpleImputer with mean strategy
# Every NaN is replaced by the mean of the observed values in its own column.
simple_imputer = SimpleImputer(strategy='mean')
# fit_transform returns a plain array by default, so the row labels have to be
# passed back in explicitly. Without index=df.index the result is renumbered
# 0..n-1 and no longer lines up with df when df has a non-default index.
df_simple_imputed = pd.DataFrame(
    simple_imputer.fit_transform(df[num_cols]),
    columns=num_cols,
    index=df.index,
)

print("\nData after SimpleImputer (mean):")
print(df_simple_imputed)

# 5. Regression-based imputation for 'Age'
# Fit a model on the rows where the target is known, then predict the target
# for the rows where it is missing.
target_col = 'Age'

# Compute the mask once and reuse it for splitting and for writing back.
missing_mask = df[target_col].isnull()
train_df = df[~missing_mask]
predict_df = df[missing_mask]

# Work on a copy so df stays untouched. The copy is created before branching
# so df_reg_imputed is always defined, even when there is nothing to impute.
df_reg_imputed = df.copy()

if not predict_df.empty:
    X_train = train_df.drop(columns=[target_col])
    y_train = train_df[target_col]
    X_predict = predict_df.drop(columns=[target_col])

    # NOTE: the predictor columns (Salary, Experience) still contain NaN here.
    # RandomForestRegressor accepts NaN inputs only in scikit-learn 1.4+;
    # older releases reject them at fit time.
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    predicted_values = model.predict(X_predict)

    # predicted_values is in the row order of predict_df, which is the same
    # order in which the mask selects rows, so positional assignment is safe.
    df_reg_imputed.loc[missing_mask, target_col] = predicted_values

    print(f"\nData after regression-based imputation for '{target_col}':")
    print(df_reg_imputed)
else:
    print(f"\nNo missing values in '{target_col}' to impute with regression.")

# 6. KNN Imputer
# Each NaN is filled from the values of that column in the 2 nearest rows.
# NOTE: the columns are not scaled. Salary is in the tens of thousands while
# Age and Experience are one or two digits, so Salary differences dominate the
# distance whenever both rows have a Salary. Scale the features first if the
# neighbours should reflect all columns.
knn_imputer = KNNImputer(n_neighbors=2)
# Keep df's row labels on the result; fit_transform returns a plain array by
# default and would otherwise be renumbered 0..n-1.
df_knn_imputed = pd.DataFrame(
    knn_imputer.fit_transform(df[num_cols]),
    columns=num_cols,
    index=df.index,
)

print("\nData after KNN Imputer:")
print(df_knn_imputed)

Original Data:
    Age   Salary  Experience
0  25.0  50000.0         2.0
1  30.0  60000.0         5.0
2   NaN  55000.0         3.0
3  35.0      NaN         7.0
4  40.0  65000.0         NaN
5   NaN  62000.0         6.0
6  45.0      NaN         8.0

Data after SimpleImputer (mean):
    Age   Salary  Experience
0  25.0  50000.0    2.000000
1  30.0  60000.0    5.000000
2  35.0  55000.0    3.000000
3  35.0  58400.0    7.000000
4  40.0  65000.0    5.166667
5  35.0  62000.0    6.000000
6  45.0  58400.0    8.000000



Data after regression-based imputation for 'Age':
         Age   Salary  Experience
0  25.000000  50000.0         2.0
1  30.000000  60000.0         5.0
2  26.903333  55000.0         3.0
3  35.000000      NaN         7.0
4  40.000000  65000.0         NaN
5  32.328333  62000.0         6.0
6  45.000000      NaN         8.0

Data after KNN Imputer:
    Age   Salary  Experience
0  25.0  50000.0         2.0
1  30.0  60000.0         5.0
2  40.0  55000.0         3.0
3  35.0  61000.0         7.0
4  40.0  65000.0         7.5
5  40.0  62000.0         6.0
6  45.0  58500.0         8.0
