# A6: Imputation via Regression for Missing Data

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
import numpy as np
print(np.__version__)


1.26.4


In [4]:
df = pd.read_csv("UCI_Credit_Card.csv")

In [5]:
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [6]:
df.isna().sum()

ID                            0
LIMIT_BAL                     0
SEX                           0
EDUCATION                     0
MARRIAGE                      0
AGE                           0
PAY_0                         0
PAY_2                         0
PAY_3                         0
PAY_4                         0
PAY_5                         0
PAY_6                         0
BILL_AMT1                     0
BILL_AMT2                     0
BILL_AMT3                     0
BILL_AMT4                     0
BILL_AMT5                     0
BILL_AMT6                     0
PAY_AMT1                      0
PAY_AMT2                      0
PAY_AMT3                      0
PAY_AMT4                      0
PAY_AMT5                      0
PAY_AMT6                      0
default.payment.next.month    0
dtype: int64

In [7]:
df.shape

(30000, 25)

In [8]:
np.random.seed(42)  # for reproducibility

cols_to_nan = ['AGE', 'BILL_AMT1', 'BILL_AMT2']  

# Choose fraction of missing data (5–10%)
missing_fraction = 0.07  # 7% missing for demo

for col in cols_to_nan:
    n_missing = int(missing_fraction * len(df))
    missing_indices = np.random.choice(df.index, n_missing, replace=False)
    df.loc[missing_indices, col] = np.nan

print("\nMissing values per column:")
print(df[cols_to_nan].isna().sum())


Missing values per column:
AGE          2100
BILL_AMT1    2100
BILL_AMT2    2100
dtype: int64


In [9]:
target = 'default.payment.next.month'  # as per UCI dataset column name
if target not in df.columns:
    print("\n Warning: Target column name might differ — check your CSV header.")
else:
    print("\nTarget column found:", target)



Target column found: default.payment.next.month


In [10]:
df.isna().sum()

ID                               0
LIMIT_BAL                        0
SEX                              0
EDUCATION                        0
MARRIAGE                         0
AGE                           2100
PAY_0                            0
PAY_2                            0
PAY_3                            0
PAY_4                            0
PAY_5                            0
PAY_6                            0
BILL_AMT1                     2100
BILL_AMT2                     2100
BILL_AMT3                        0
BILL_AMT4                        0
BILL_AMT5                        0
BILL_AMT6                        0
PAY_AMT1                         0
PAY_AMT2                         0
PAY_AMT3                         0
PAY_AMT4                         0
PAY_AMT5                         0
PAY_AMT6                         0
default.payment.next.month       0
dtype: int64

In [11]:
dataset_A = df.copy()

missing_cols = dataset_A.columns[dataset_A.isna().any()]
print("Columns with missing values:", list(missing_cols))

#  Filling missing values using Median Imputation 
for col in missing_cols:
    median_value = dataset_A[col].median()
    dataset_A[col].fillna(median_value, inplace=True)
    print(f"Filled missing values in '{col}' with median = {median_value}")

#  Confirm imputation 
print("\nAfter imputation, number of missing values per column:")
print(dataset_A.isnull().sum())

Columns with missing values: ['AGE', 'BILL_AMT1', 'BILL_AMT2']
Filled missing values in 'AGE' with median = 34.0
Filled missing values in 'BILL_AMT1' with median = 22476.0
Filled missing values in 'BILL_AMT2' with median = 21410.5

After imputation, number of missing values per column:
ID                            0
LIMIT_BAL                     0
SEX                           0
EDUCATION                     0
MARRIAGE                      0
AGE                           0
PAY_0                         0
PAY_2                         0
PAY_3                         0
PAY_4                         0
PAY_5                         0
PAY_6                         0
BILL_AMT1                     0
BILL_AMT2                     0
BILL_AMT3                     0
BILL_AMT4                     0
BILL_AMT5                     0
BILL_AMT6                     0
PAY_AMT1                      0
PAY_AMT2                      0
PAY_AMT3                      0
PAY_AMT4                      0
PAY_AMT5 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset_A[col].fillna(median_value, inplace=True)


Median is often preferred over Mean because it is more robust to outliers.
In datasets like credit card client data, columns such as 'AGE' or 'BILL_AMT' often contain skewed distributions ( a few very rich clients or extremely young ones).
If mean is used, those outliers will pull the imputed values toward the extremes, distorting the central tendency.
The median, however, stays solid. It represents the “middle” of the data and gives a more realistic and stable replacement for missing values in such cases.

In [13]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

dataset_B = df.copy()

#  Select target column for regression imputation 
target_col = 'AGE'

# Separate rows where target is missing and not missing
missing_mask = dataset_B[target_col].isna()
train_data = dataset_B[~missing_mask]
predict_data = dataset_B[missing_mask]


# Drop target + non-numeric columns
X_train = train_data.drop(columns=[target_col, 'default.payment.next.month'], errors='ignore')
y_train = train_data[target_col]
X_pred = predict_data.drop(columns=[target_col, 'default.payment.next.month'], errors='ignore')

# Keep only numeric columns
X_train = X_train.select_dtypes(include=[np.number])
X_pred = X_pred.select_dtypes(include=[np.number])


# Fill remaining NaNs in features with median (so regression can fit)
for col in X_train.columns:
    median_val = X_train[col].median()
    X_train[col].fillna(median_val, inplace=True)
    X_pred[col].fillna(median_val, inplace=True)

# Scale features 
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_pred_scaled = scaler.transform(X_pred)

# Train Linear Regression model 
reg_model = LinearRegression()
reg_model.fit(X_train_scaled, y_train)

# Predict missing AGE values
predicted_ages = reg_model.predict(X_pred_scaled)

# Fill missing AGE values back into Dataset B
dataset_B.loc[missing_mask, target_col] = predicted_ages

print(f"Number of missing values in '{target_col}' after regression imputation:",
      dataset_B[target_col].isna().sum())


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_pred[col].fillna(median_val, inplace=True)


Number of missing values in 'AGE' after regression imputation: 0


The Underlying Assumption for Missing At Random (MAR):

Regression imputation assumes that the missingness in 'AGE' depends on other observed features, not on 'AGE' itself.

Some clients’ 'AGE' is missing because of how the data was collected ( younger users tend to skip entering their age),
but we can still predict it using other variables like 'LIMIT_BAL', 'EDUCATION', or 'PAY_0'.

That’s what “Missing At Random” means: The probability of missingness depends only on other known data, not the missing value itself.

If the data were Not Missing At Random (NMAR), say, older people intentionally hide their age then regression imputation wouldn’t fix that bias.

In [15]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor


# --- Step 2: Create Dataset C ---
dataset_C = df.copy()

# --- Step 3: Select target column for imputation ---
target_col = 'AGE'

# Separate rows where target is missing and not missing
missing_mask = dataset_C[target_col].isna()
train_data = dataset_C[~missing_mask]
predict_data = dataset_C[missing_mask]

# --- Step 4: Prepare features ---
# Drop target and non-numeric columns
X_train = train_data.drop(columns=[target_col, 'default.payment.next.month'], errors='ignore')
y_train = train_data[target_col]
X_pred = predict_data.drop(columns=[target_col, 'default.payment.next.month'], errors='ignore')

# Keep numeric columns
X_train = X_train.select_dtypes(include=[np.number])
X_pred = X_pred.select_dtypes(include=[np.number])

# --- Step 5: Fill NaNs in features (median) ---
for col in X_train.columns:
    median_val = X_train[col].median()
    X_train[col].fillna(median_val, inplace=True)
    X_pred[col].fillna(median_val, inplace=True)

# --- Step 6: Scale features ---
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_pred_scaled = scaler.transform(X_pred)

# --- OPTION A: Use K-Nearest Neighbors Regressor ---
knn_model = KNeighborsRegressor(n_neighbors=5)
knn_model.fit(X_train_scaled, y_train)
predicted_ages_knn = knn_model.predict(X_pred_scaled)

# --- Fill missing values ---
dataset_C.loc[missing_mask, target_col] = predicted_ages_knn

print(f"Number of missing values in '{target_col}' after KNN imputation:",
      dataset_C[target_col].isna().sum())


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_pred[col].fillna(median_val, inplace=True)


ValueError: Found array with 0 sample(s) (shape=(0, 23)) while a minimum of 1 is required by StandardScaler.