## Load data

In [1]:
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
import random
import numpy as np
random.seed(0)

In [2]:
# Fetch dataset
dataset = fetch_california_housing()
train, target = pd.DataFrame(dataset.data), pd.DataFrame(dataset.target)
train.columns = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven']
train.insert(loc=len(train.columns), column='target', value=target)

In [3]:
# Inject NaN values in data
column = train['zero']
missing_pct = int(column.size * 0.4)
i = [random.choice(range(column.shape[0])) for _ in range(missing_pct)]
column[i] = np.NaN

In [14]:
train.shape

(20640, 9)

## Drop Features with Missing Values

In [16]:
zero_dropna = train.dropna(inplace=False)
zero_dropna.shape

(20640, 9)

## Median/Mean/Most Frequent Value Replacement

In [27]:
# Impute values (You can use an imputer to do so! They impute (replace) values for you)
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.NaN, strategy='mean') # there are also other options for strategy too `mean` `most_frequent` (including that for categorical features)
imputer = imputer.fit(train[['zero']]) # train[['zero']] ensures that pd.Series is pd.DataFrame for method.
zero_mean = train.copy()
zero_mean['zero'] = imputer.transform(zero_mean[['zero']]).ravel() # transforms the zero column by unravelling the results of imputation (creating a vector) that can be assigned to zero column
zero_mean

Unnamed: 0,zero,one,two,three,four,five,six,seven,target
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


## KNN 

In [None]:
# !pip install --upgrade pip

In [5]:
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=2, weights="uniform")
imputer.fit_transform(train[["zero"]])
zero_knn = train.copy()
zero_knn["zero"] = imputer.transform(zero_knn[["zero"]]).ravel()
zero_knn

Unnamed: 0,zero,one,two,three,four,five,six,seven,target
0,3.87794,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.30140,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,3.87794,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.64310,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.87794,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.56030,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.55680,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.70000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,3.87794,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847
