In [1]:
# Question: Predictive Imputation Using Machine Learning
# Description: Use a simple predictive model to impute missing values in a column.


import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Toy dataset: two fully populated feature columns plus a 'target' column
# containing gaps (the None entries are the values to be imputed later).
data = dict(
    feature1=[10, 20, 30, 40, 50, 60, 70, 80],
    feature2=[1, 3, 5, 7, 9, 11, 13, 15],
    target=[100, 200, None, 400, 500, None, 700, 800],
)
df = pd.DataFrame(data)

# Predictive imputation: fit a regressor on the rows whose target is observed,
# then use it to fill in the rows whose target is missing.
FEATURE_COLS = ['feature1', 'feature2']
TARGET_COL = 'target'

# Compute the missing-value mask once so that the train/predict split and the
# final write-back are guaranteed to refer to exactly the same rows.
missing_mask = df[TARGET_COL].isna()
df_train = df[~missing_mask]
df_missing = df[missing_mask]

# Features and target for training (observed rows only)
X_train = df_train[FEATURE_COLS]
y_train = df_train[TARGET_COL]

# Features for the rows that need a prediction
X_missing = df_missing[FEATURE_COLS]

# Train a regression model; the fixed seed keeps the imputed values reproducible
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

if missing_mask.any():
    # predict() returns one value per row of X_missing, in row order, which is
    # the same order in which the boolean mask selects rows for assignment.
    predicted_values = model.predict(X_missing)
    df.loc[missing_mask, TARGET_COL] = predicted_values
else:
    # Nothing to impute. scikit-learn raises ValueError when asked to predict
    # on a zero-row input, so skip the call (this also makes re-running this
    # step on an already-imputed frame safe) and keep the name defined.
    predicted_values = y_train.iloc[:0].to_numpy()

# Invariant: every target value is populated after imputation
assert df[TARGET_COL].notna().all(), "imputation left missing target values"

print("Data after imputation:")
print(df)



Data after imputation:
   feature1  feature2  target
0        10         1   100.0
1        20         3   200.0
2        30         5   246.0
3        40         7   400.0
4        50         9   500.0
5        60        11   563.0
6        70        13   700.0
7        80        15   800.0
