<h1>Handling Missing Values</h1>

<h2>1. Deletion Method</h2>

In [47]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing

In [48]:
# Read a data
data = fetch_california_housing()

In [49]:
# Convert to pandas
df=pd.DataFrame(data.data , columns=data.feature_names)
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [50]:
# Checking for missing values
df.isnull().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64

In [51]:
df.iloc[::10]=None
df.isnull().sum()

MedInc        2064
HouseAge      2064
AveRooms      2064
AveBedrms     2064
Population    2064
AveOccup      2064
Latitude      2064
Longitude     2064
dtype: int64

In [52]:
# Dropping missing values
df=df.dropna()
df.isnull().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64

<h2>2. Mean/Median Imputation</h2>

In [53]:
from sklearn.datasets import fetch_california_housing 
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

In [56]:
# Read data
data=fetch_california_housing()

In [57]:
# Convert it into dataframe
df=pd.DataFrame(data.data, columns=data.feature_names) 
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [58]:
# Check for missing values
df.isnull().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64

In [59]:
# put some missing values into the dataframe
df.iloc[::10]=np.nan
df.isnull().sum()

MedInc        2064
HouseAge      2064
AveRooms      2064
AveBedrms     2064
Population    2064
AveOccup      2064
Latitude      2064
Longitude     2064
dtype: int64

In [60]:
# Fill missing values with the mean of the column
imputer = SimpleImputer(strategy='mean') 
df_imputed = imputer.fit_transform(df)

In [61]:
# Convert the imputed data back to a Pandas DataFrame
df_imputed = pd.DataFrame(df_imputed, columns=df.columns)
df.isnull().sum()

MedInc        2064
HouseAge      2064
AveRooms      2064
AveBedrms     2064
Population    2064
AveOccup      2064
Latitude      2064
Longitude     2064
dtype: int64

<h2>3. Regression Imputation</h2>

In [62]:
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression 
import pandas as pd
import numpy as np

In [63]:
# Read data
data=fetch_california_housing()
# Convert it into dataframe
df=pd.DataFrame(data.data, columns=data.feature_names) 
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [64]:
df.isnull().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64

In [65]:
# Put some missing data
df.iloc[10:20, 0]=None
df.isnull().sum()

MedInc        10
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
dtype: int64

In [66]:
# Split the dataset into with and without missing values
x_missing=df[df.isna().any(axis=1)] 
x_no_missing=df.dropna()

In [67]:
x_train=x_no_missing.drop(columns=['MedInc']) 
y_train=x_no_missing['MedInc'] 
x_test=x_missing.drop(columns=['MedInc'])

In [68]:
# Define a model
model=LinearRegression()
model.fit(x_train,y_train) # Fit a model with features and targetr as MedInc

In [69]:
# Impute missing values using the trained model
x_missing['MedInc']=model.predict(x_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_missing['MedInc']=model.predict(x_test)


In [70]:
# Concatenate the two datasets back together
x_imputed = pd.concat([x_missing, x_no_missing], axis=0)

In [71]:
# Check if there are any missing values left
print(x_imputed.isnull().sum())

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64


<h2>4. Using Interpolation Method</h2>

In [72]:
from sklearn.datasets import load_iris 
import pandas as pd

In [73]:
# Load Iris dataset
data = load_iris()
X = pd.DataFrame(data.data, columns=data.feature_names) # features y = data.target # target

In [75]:
 # Add some missing values
X.iloc[10:20, 0] = None 
X.iloc[20:30, 2] = None
df.isnull().sum()

MedInc        10
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
dtype: int64

In [76]:
# Interpolate missing values using linear interpolation 
X = X.interpolate(method='linear') # simple
df.isnull().sum()

MedInc        10
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
dtype: int64

<h2>5. Using Multiple Interpolation Method</h2>

In [77]:
from sklearn.datasets import load_iris 
import pandas as pd

In [78]:
 # Read dataset
x=load_iris() 
y=x.target

In [79]:
 # Convert it into dataframe
x=pd.DataFrame(x.data, columns=x.feature_names)
x.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [80]:
x.isnull().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
dtype: int64

In [81]:
# Put some missing values
x.iloc[10:20,0]=None
x.isnull().sum()

sepal length (cm)    10
sepal width (cm)      0
petal length (cm)     0
petal width (cm)      0
dtype: int64

In [82]:
# Interpolate missing values using different interpolation methods
x['sepal length (cm)'] = x['sepal length (cm)'].interpolate(method='linear')
x['sepal width (cm)'] = x['sepal width (cm)'].interpolate(method='quadratic')
x.isnull().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
dtype: int64

<h2>6. Multiple Imputation Technique</h2>

In [83]:
from sklearn.datasets import load_iris
from sklearn.experimental import enable_iterative_imputer # iteration
from sklearn.impute import IterativeImputer # mutltiple imputer 
import pandas as pd

In [84]:
# Load Iris dataset
data = load_iris()
X = pd.DataFrame(data.data, columns=data.feature_names)

In [85]:
# Add some missing values
X.iloc[10:20, 0] = None
# Impute missing values using IterativeImputer
imp = IterativeImputer(max_iter=10, random_state=0) 
X_imputed = imp.fit_transform(X)
# Check if there are any missing values left
pd.DataFrame(X_imputed).isnull().sum()

0    0
1    0
2    0
3    0
dtype: int64