In [31]:
#Handling missing numerical using algorithm like regression and knn

import pandas as pd
import numpy as np

data = {
    'Bedrooms' : [2, 1, 4, np.nan, 5, np.nan ,8],
    'Bathrooms' : [np.nan, 3, 2, 5, np.nan, 4,1],
    'sq_foot' : [100, np.nan, 200, 500, np.nan, 300, np.nan],
    'Price': [200000, 300000, np.nan, 250000, 500000, 700000, 400000]
}

df = pd.DataFrame(data)

In [32]:
df

Unnamed: 0,Bedrooms,Bathrooms,sq_foot,Price
0,2.0,,100.0,200000.0
1,1.0,3.0,,300000.0
2,4.0,2.0,200.0,
3,,5.0,500.0,250000.0
4,5.0,,,500000.0
5,,4.0,300.0,700000.0
6,8.0,1.0,,400000.0


In [35]:
from sklearn.impute import KNNImputer

#conver the dataset to a numpy array from knn

imputer = KNNImputer(n_neighbors = 2)

# perform KNN imputation

df_knn = imputer.fit_transform(df[['Bedrooms', 'Bathrooms', 'sq_foot']])

#convert back to the dataframe for easier interpretation

df_knn1 = pd.DataFrame(df_knn, columns = ['Bedrooms', 'Bathrooms', 'sq_foot'])

In [36]:
df_knn1

Unnamed: 0,Bedrooms,Bathrooms,sq_foot
0,2.0,2.0,100.0
1,1.0,3.0,200.0
2,4.0,2.0,200.0
3,4.5,5.0,500.0
4,5.0,1.5,150.0
5,4.5,4.0,300.0
6,8.0,1.0,250.0


In [37]:
# optimal improvement to the above algorithm
# use standard scaler if the dataset is large value 
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

pipeline = make_pipeline(
          StandardScaler(),
           KNNImputer(n_neighbors = 2)
)

impute_data = pipeline.fit_transform(df[['Bedrooms', 'Bathrooms', 'sq_foot']])
#revese the imputation process -> target columns should not be negative
#scaler = pipeline.named_steps['standardscaler']
#impute = scaler.inverse_transform(impute_data)
df_knn = pd.DataFrame(impute_data, columns = ['Bedrooms', 'Bathrooms', 'sq_foot'])
df_knn

Unnamed: 0,Bedrooms,Bathrooms,sq_foot
0,-0.816497,-0.353553,-1.183216
1,-1.224745,0.0,-0.507093
2,0.0,-0.707107,-0.507093
3,-0.612372,1.414214,1.521278
4,0.408248,-1.06066,-0.845154
5,-0.612372,0.707107,0.169031
6,1.632993,-1.414214,-0.169031


In [38]:
df_clean = pd.concat([df_knn1 , df['Price']], axis = 1)

In [39]:
# Regression Imputation (Example):

from sklearn.linear_model import LinearRegression
df_nomissing = df_clean[df_clean['Price'].notnull()]

#Separate known and unknown data

x = df_nomissing[['Bedrooms', 'Bathrooms', 'sq_foot']]
y = df_nomissing['Price']

#Train regression model to predict missing price

model = LinearRegression()

model.fit(x, y)

#predict missing value for rows where  'price' is nan

x_missing = df[df['Price'].isnull()][['Bedrooms', 'Bathrooms', 'sq_foot']]

pridicted_price = model.predict(x_missing)



In [40]:
pridicted_price

array([301165.56924384])

In [41]:
df_clean.loc[df_clean['Price'].isnull(), 'Price'] = pridicted_price

In [42]:
df_clean

Unnamed: 0,Bedrooms,Bathrooms,sq_foot,Price
0,2.0,2.0,100.0,200000.0
1,1.0,3.0,200.0,300000.0
2,4.0,2.0,200.0,301165.569244
3,4.5,5.0,500.0,250000.0
4,5.0,1.5,150.0,500000.0
5,4.5,4.0,300.0,700000.0
6,8.0,1.0,250.0,400000.0
