# Finding outliers with the IQR proximity rule

In [1]:
import numpy as np
import pandas as pd

# California housing dataset for the demo
from sklearn.datasets import fetch_california_housing

In [2]:
# Load the California housing data from scikit-learn.
# The call returns a (features, target) pair; the cells shown in this
# notebook only work with the features, X.
X, y = fetch_california_housing(return_X_y=True, as_frame=True)

# Display the first 5 rows of the features.
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [3]:
def find_limits(df: pd.DataFrame, variable: str, fold: float = 1.5) -> tuple:
    """Return the IQR proximity-rule limits for one column.

    The inter-quartile range (IQR) is the distance between the 75th and
    25th percentiles of ``df[variable]``. The limits sit ``fold`` IQRs
    beyond each quartile:

        lower_limit = Q1 - fold * IQR
        upper_limit = Q3 + fold * IQR

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the column to analyse.
    variable : str
        Name of the column in ``df``.
    fold : float, default 1.5
        Multiplier applied to the IQR. Larger values give wider limits
        and therefore flag fewer observations.

    Returns
    -------
    tuple
        ``(lower_limit, upper_limit)``.
    """
    # Compute each quartile once and reuse it for both the IQR and the limits.
    q1 = df[variable].quantile(0.25)
    q3 = df[variable].quantile(0.75)
    iqr = q3 - q1

    lower_limit = q1 - (iqr * fold)
    upper_limit = q3 + (iqr * fold)

    return lower_limit, upper_limit

In [4]:
# Compute the lower and upper outlier limits for MedInc, placed
# 3 x IQR beyond the 25th and 75th percentiles (fold=3).

lower_limit, upper_limit = find_limits(X, "MedInc", 3)
lower_limit, upper_limit

(-3.9761500000000005, 11.2828)

In [5]:
# Flag the outliers in the data set: True where MedInc lies strictly above
# upper_limit or strictly below lower_limit, False otherwise.
# The comparison already yields booleans, so no np.where(..., True, False)
# wrapper is needed; to_numpy() keeps `outliers` a NumPy boolean array.

outliers = (
    (X["MedInc"] > upper_limit) | (X["MedInc"] < lower_limit)
).to_numpy()

In [6]:
# How many outliers did we find?
# Summing the boolean flags counts the True entries.

outliers.sum()

140

In [7]:
# Compute the limits for another variable, HouseAge, again with fold=3.
# NOTE: this rebinds lower_limit / upper_limit, replacing the MedInc limits
# computed earlier.

lower_limit, upper_limit = find_limits(X, "HouseAge", 3)
lower_limit, upper_limit

(-39.0, 94.0)

In [8]:
# Flag the outliers in the data set: True where HouseAge lies strictly above
# upper_limit or strictly below lower_limit, False otherwise.
# The comparison already yields booleans, so no np.where(..., True, False)
# wrapper is needed; to_numpy() keeps `outliers` a NumPy boolean array.

outliers = (
    (X["HouseAge"] > upper_limit) | (X["HouseAge"] < lower_limit)
).to_numpy()

In [9]:
# How many outliers did we find?
# Summing the boolean flags counts the True entries.

outliers.sum()

0