# Capping / Censoring outliers

In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from feature_engine.outliers import Winsorizer

In [2]:
# Load the breast cancer dataset as pandas objects: the features go into X
# and the target into y.
X, y = load_breast_cancer(
    as_frame=True,
    return_X_y=True,
)

# Preview the first rows of the feature table.
X.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [3]:
# Hold out 30% of the observations as a test set; the fixed random_state
# keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Sizes of the resulting training and test sets.
X_train.shape, X_test.shape

((398, 30), (171, 30))

In [4]:
def find_limits(df, variable, fold, ddof=1):
    """Return the Gaussian capping limits for one variable.

    The limits are the column mean minus / plus ``fold`` times the column
    standard deviation.

    Parameters
    ----------
    df : pandas.DataFrame
        Data from which the limits are learned.
    variable : str
        Name of the column to analyse.
    fold : float
        Number of standard deviations between the mean and each limit.
    ddof : int, default=1
        Delta degrees of freedom forwarded to ``Series.std``. The default
        reproduces pandas' sample standard deviation (the original
        behaviour); ``ddof=0`` uses the population standard deviation.
        NOTE(review): the Winsorizer caps shown later in this notebook are
        numerically consistent with ``ddof=0`` -- confirm against the
        Feature-engine documentation.

    Returns
    -------
    tuple of (float, float)
        ``(lower_limit, upper_limit)``.
    """
    column = df[variable]
    var_mean = column.mean()
    var_std = column.std(ddof=ddof)
    return var_mean - fold * var_std, var_mean + fold * var_std

In [5]:
# Variable whose outliers will be capped.
var = "worst smoothness"

# Learn the limits from the training set only, placing them 3 standard
# deviations away from the mean.
lower_limit, upper_limit = find_limits(df=X_train, variable=var, fold=3)

lower_limit, upper_limit

(0.06356074164705164, 0.20149734880520967)

In [6]:
# Work on copies so the original train / test frames stay untouched and can
# be compared with the capped data afterwards.
train_t, test_t = X_train.copy(), X_test.copy()

In [7]:
# Cap outliers: apply the limits learned on the training set to both the
# training and the test copies.
for frame in (train_t, test_t):
    frame[var] = frame[var].clip(lower=lower_limit, upper=upper_limit)

In [8]:
# Range of the variable in the original (uncapped) training set.
X_train[var].aggregate(["min", "max"])

min    0.07117
max    0.22260
Name: worst smoothness, dtype: float64

In [9]:
# Range of the same variable after capping. Use `var` rather than repeating
# the column name so this check always inspects the variable that was capped.
train_t[var].agg(["min", "max"])

min    0.071170
max    0.201497
Name: worst smoothness, dtype: float64

## Feature-engine

In [10]:
# Configure a Winsorizer that caps both tails of the two selected variables
# using the gaussian method with fold=3.
capper = Winsorizer(
    capping_method="gaussian",
    tail="both",
    fold=3,
    variables=["worst smoothness", "worst texture"],
)

# Learn the capping values from the training set.
capper.fit(X_train)

In [11]:
# Lower caps learned by the fitted Winsorizer, keyed by variable name.
capper.left_tail_caps_

{'worst smoothness': 0.06364743973736293, 'worst texture': 7.115307053129349}

In [12]:
# Upper caps learned by the fitted Winsorizer, keyed by variable name.
capper.right_tail_caps_

{'worst smoothness': 0.2014106507148984, 'worst texture': 43.95373817300131}

In [13]:
# Cap both sets with the fitted Winsorizer. Note: train_t / test_t are
# reused here and replace the manually capped copies created earlier.
train_t, test_t = capper.transform(X_train), capper.transform(X_test)

In [14]:
# Range of the selected variables in the original training set.
print(
    X_train[capper.variables_].agg(["min", "max"])
)

     worst smoothness  worst texture
min           0.07117          12.02
max           0.22260          49.54


In [15]:
# Range of the same variables after the Winsorizer has been applied.
print(
    train_t[capper.variables_].agg(["min", "max"])
)

     worst smoothness  worst texture
min          0.071170      12.020000
max          0.201411      43.953738
