In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor

# --- Sample data ------------------------------------------------------------
# Eight records: 'Age' is NaN for three of them, while 'Height', 'Weight'
# and 'Salary' are fully populated.
data = {
    'Age': [25, np.nan, 30, 22, np.nan, 35, 40, np.nan],
    'Height': [170, 165, 180, 175, 160, 185, 178, 168],
    'Weight': [70, 60, 80, 75, 55, 85, 77, 65],
    'Salary': [50000, 48000, 52000, 51000, 47000, 53000, 54000, 49000],
}
df = pd.DataFrame(data)

print("Original DataFrame with missing Age values:", df, sep="\n")

# --- Predictors -------------------------------------------------------------
# Only the complete columns are used as model inputs.
feature_cols = ['Height', 'Weight', 'Salary']

# --- Partition rows by whether 'Age' is known --------------------------------
# The mask is computed once, before anything is filled in, and reused both for
# the split here and for writing the predictions back further down.
age_missing = df['Age'].isna()
train_df = df[~age_missing]
predict_df = df[age_missing]

# --- Training inputs / prediction inputs -------------------------------------
X_train, y_train = train_df[feature_cols], train_df['Age']
X_predict = predict_df[feature_cols]

# --- Fit the regressor on the rows whose 'Age' is known ----------------------
# random_state is pinned so repeated runs give the same forest; fit() returns
# the estimator itself, so the fitted model can be bound in one statement.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train, y_train
)

# --- Estimate the unknown ages and write them into the frame -----------------
predicted_ages = model.predict(X_predict)
df.loc[age_missing, 'Age'] = predicted_ages

print("\nDataFrame after Predictive Imputation of 'Age':", df, sep="\n")

Original DataFrame with missing Age values:
    Age  Height  Weight  Salary
0  25.0     170      70   50000
1   NaN     165      60   48000
2  30.0     180      80   52000
3  22.0     175      75   51000
4   NaN     160      55   47000
5  35.0     185      85   53000
6  40.0     178      77   54000
7   NaN     168      65   49000

DataFrame after Predictive Imputation of 'Age':
     Age  Height  Weight  Salary
0  25.00     170      70   50000
1  25.09     165      60   48000
2  30.00     180      80   52000
3  22.00     175      75   51000
4  25.09     160      55   47000
5  35.00     185      85   53000
6  40.00     178      77   54000
7  25.09     168      65   49000
