In [None]:
# Question: Predictive Imputation Using Machine Learning
# Description: Use a simple predictive model to impute missing values in a column.



In [None]:
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Read the raw table and flag zeros as missing in one chained step.
# NOTE: every literal 0 in every column is treated as a missing-value marker.
data = pd.read_csv('data.csv').replace(0, np.nan)

# Iterative (model-based) imputer; the seed is pinned for repeatable results.
imputer = IterativeImputer(random_state=0)

# Fit on the table and fill its missing entries.
imputed_data = imputer.fit_transform(data)

# Wrap the imputed values back into a DataFrame with the original column labels.
imputed_df = pd.DataFrame(data=imputed_data, columns=data.columns)


In [2]:
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.datasets import fetch_california_housing  # Example dataset

def impute_dataframe(data, missing_values_to_replace=None, impute_columns=None, random_state=0):
    """
    Perform iterative imputation on a DataFrame without modifying the input.

    Parameters:
    - data: pandas DataFrame. It is never mutated; all work happens on a copy.
    - missing_values_to_replace: list or set, values to treat as missing
      (default None means only values that are already NaN count as missing).
    - impute_columns: list, columns to impute (default None means every column
      that contains at least one missing value after the replacement step).
    - random_state: int, seed for reproducibility.

    Returns:
    - imputed_df: new DataFrame in which the selected columns have been
      imputed. If there is nothing to impute (no column selected), an
      unchanged copy of the (sentinel-replaced) data is returned.
    """
    # Work on a copy so the caller's frame keeps its original values.
    imputed_df = data.copy()

    # Turn all requested sentinel values into NaN in a single pass.
    if missing_values_to_replace is not None:
        sentinels = list(missing_values_to_replace)
        if sentinels:
            imputed_df = imputed_df.replace(sentinels, np.nan)

    # Default selection: only the columns that actually contain missing values.
    if impute_columns is None:
        impute_columns = imputed_df.columns[imputed_df.isnull().any()].tolist()

    # Nothing selected -> nothing to fit. Fitting the imputer on a frame with
    # zero columns would raise, so return the copy as-is.
    if len(impute_columns) == 0:
        return imputed_df

    # Fit the imputer on the selected columns only; the other columns are
    # neither used as predictors nor altered.
    imputer = IterativeImputer(random_state=random_state)
    imputed_array = imputer.fit_transform(imputed_df[impute_columns])

    # Write the imputed values back into the copy, keeping index and column order.
    imputed_df.loc[:, impute_columns] = imputed_array

    return imputed_df

# Load dataset from sklearn
california = fetch_california_housing(as_frame=True)
df = california.frame

# Introduce some missing values artificially for demonstration
MISSING_FRACTION = 0.1  # share of cells blanked out, chosen uniformly at random
np.random.seed(0)  # fixed seed so the same cells are masked on every run
missing_mask = np.random.rand(*df.shape) < MISSING_FRACTION
df_where_missing = df.mask(missing_mask)

# Perform imputation. A copy is passed so that df_where_missing still holds
# the masked data afterwards, whether or not impute_dataframe modifies its
# argument.
imputed_df = impute_dataframe(df_where_missing.copy())

# Sanity checks: the masked frame still has gaps, the imputed one has none,
# and the table shape is unchanged.
assert df_where_missing.isna().any().any(), "masking produced no missing values"
assert not imputed_df.isna().any().any(), "imputation left missing values behind"
assert imputed_df.shape == df.shape

print(imputed_df.head())


   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup   Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556  37.880000   
1  8.3014      21.0  6.238137   0.971880      2401.0  6.026754  34.959894   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260  37.850000   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945  37.850000   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467  37.850000   

    Longitude  MedHouseVal  
0 -122.230000        4.526  
1 -119.400788        3.585  
2 -122.240000        3.521  
3 -122.293794        3.413  
4 -122.173675        3.422  
