In [1]:
# Question: Predictive Imputation Using Machine Learning
# Description: Use a simple predictive model to impute missing values in a column.



In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Sample dataset with missing values in 'Age'.
# The None entries become NaN once the dict is loaded into a DataFrame.
data = {
    'Age': [25, 30, None, 22, 40, None, 35],
    'Salary': [50000, 60000, 55000, 52000, 70000, 58000, 62000],
    'Experience': [2, 5, 4, 1, 8, 3, 6]
}

df = pd.DataFrame(data)

# Column to impute, the predictor columns used to estimate it, and the seed
# that keeps the random forest (and therefore the imputed values) reproducible.
TARGET = 'Age'
features = ['Salary', 'Experience']
RANDOM_STATE = 42

# Compute the missing-value mask once. The same mask selects the rows that are
# predicted and the rows that are filled, so the predictions (which come back
# in row order) cannot drift out of alignment with their destination rows.
age_is_missing = df[TARGET].isnull()

# Separate rows where the target is missing and not missing
df_missing = df[age_is_missing]
df_not_missing = df[~age_is_missing]

# Training data: rows with a known target value
X_train = df_not_missing[features]
y_train = df_not_missing[TARGET]

# Data to predict: rows whose target value is missing
X_predict = df_missing[features]

# Build and train model
model = RandomForestRegressor(random_state=RANDOM_STATE)
model.fit(X_train, y_train)

if X_predict.empty:
    # Nothing to impute. scikit-learn estimators reject zero-row input to
    # predict(), so skip the call and keep an empty result with the target's dtype.
    predicted_ages = y_train.iloc[:0].to_numpy()
else:
    # Predict missing ages and fill them into the original dataframe
    predicted_ages = model.predict(X_predict)
    df.loc[age_is_missing, TARGET] = predicted_ages

# Sanity check: every row must have a value in the target column after imputation.
assert df[TARGET].notnull().all(), "imputation left missing values in the target column"

print("Data after imputation:")
print(df)


Data after imputation:
     Age  Salary  Experience
0  25.00   50000           2
1  30.00   60000           5
2  26.04   55000           4
3  22.00   52000           1
4  40.00   70000           8
5  27.80   58000           3
6  35.00   62000           6
