# Capping outliers at quantiles

In [1]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from feature_engine.outliers import Winsorizer

In [2]:
# Load the breast cancer dataset that ships with Scikit-learn and wrap the
# feature matrix in a DataFrame so that every column keeps its feature name.
breast_cancer = load_breast_cancer()
X = pd.DataFrame(data=breast_cancer.data, columns=breast_cancer.feature_names)
y = breast_cancer.target

# Hold out 30% of the observations as a test set; the fixed random_state
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Show the size of each set.
X_train.shape, X_test.shape

((398, 30), (171, 30))

In [3]:
# Preview the first rows of the training set.
X_train.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
478,11.49,14.59,73.99,404.9,0.1046,0.08228,0.05308,0.01969,0.1779,0.06574,...,12.4,21.9,82.04,467.6,0.1352,0.201,0.2596,0.07431,0.2941,0.0918
303,10.49,18.61,66.86,334.3,0.1068,0.06678,0.02297,0.0178,0.1482,0.066,...,11.06,24.54,70.76,375.4,0.1413,0.1044,0.08423,0.06528,0.2213,0.07842
155,12.25,17.94,78.27,460.3,0.08654,0.06679,0.03885,0.02331,0.197,0.06228,...,13.59,25.22,86.6,564.2,0.1217,0.1788,0.1943,0.08211,0.3113,0.08132
186,18.31,18.58,118.6,1041.0,0.08588,0.08468,0.08169,0.05814,0.1621,0.05425,...,21.31,26.36,139.2,1410.0,0.1234,0.2445,0.3538,0.1571,0.3206,0.06938
101,6.981,13.43,43.79,143.5,0.117,0.07568,0.0,0.0,0.193,0.07818,...,7.93,19.54,50.41,185.2,0.1584,0.1202,0.0,0.0,0.2932,0.09382


In [4]:
# Lower capping limit: the 5th percentile of each variable in the training set.
X_train.quantile(0.05)

mean radius                  9.714450
mean texture                13.227000
mean perimeter              61.619000
mean area                  288.950000
mean smoothness              0.074728
mean compactness             0.041967
mean concavity               0.005061
mean concave points          0.006397
mean symmetry                0.139395
mean fractal dimension       0.053887
radius error                 0.163765
texture error                0.533290
perimeter error              1.162150
area error                  11.582000
smoothness error             0.003786
compactness error            0.008053
concavity error              0.003821
concave points error         0.004048
symmetry error               0.011901
fractal dimension error      0.001540
worst radius                10.826500
worst texture               16.797500
worst perimeter             69.312500
worst area                 354.960000
worst smoothness             0.096053
worst compactness            0.070831
worst concav

In [5]:
# Upper capping limit: the 95th percentile of each variable in the training set.
X_train.quantile(0.95)

mean radius                  20.597500
mean texture                 27.000500
mean perimeter              136.095000
mean area                  1312.350000
mean smoothness               0.118600
mean compactness              0.208445
mean concavity                0.241700
mean concave points           0.127915
mean symmetry                 0.231465
mean fractal dimension        0.074880
radius error                  0.980675
texture error                 2.192800
perimeter error               7.066200
area error                  118.875000
smoothness error              0.012370
compactness error             0.058603
concavity error               0.077445
concave points error          0.022367
symmetry error                0.034802
fractal dimension error       0.007615
worst radius                 25.943500
worst texture                36.277500
worst perimeter             177.060000
worst area                 2056.000000
worst smoothness              0.173215
worst compactness        

In [6]:
# Cap outliers
#
# The capping limits are the 5th and 95th percentiles of each variable. They
# are learned from the training set only and then applied to both sets, so the
# test set never influences the limits.
lower_limits = X_train.quantile(0.05)
upper_limits = X_train.quantile(0.95)

# clip() returns a new DataFrame; axis=1 aligns the limits (indexed by variable
# name) with the columns. The result is assigned back instead of calling
# `X_train[variable].clip(..., inplace=True)`: that form acts on a temporary
# Series, and under pandas Copy-on-Write it leaves the DataFrame unchanged.
X_train = X_train.clip(lower=lower_limits, upper=upper_limits, axis=1)
X_test = X_test.clip(lower=lower_limits, upper=upper_limits, axis=1)

# Check: the extremes of a capped variable should now equal its limits.
X_train["worst smoothness"].min(), X_train["worst smoothness"].max()

(0.0960535, 0.17321499999999998)

## Feature-engine

In [7]:
# Split again from the original X, with the same 70/30 proportions and seed as
# before, so that the Feature-engine example works on its own train and test
# sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Show the size of each set.
X_train.shape, X_test.shape

((398, 30), (171, 30))

In [8]:
# Set up a transformer that caps the two selected variables at both tails,
# using quantiles to find the limits; fold=0.05 is the fraction cut at each
# tail.
capper = Winsorizer(
    capping_method="quantiles",
    tail="both",
    fold=0.05,
    variables=["worst smoothness", "worst texture"],
)

# Learn the capping values from the training set.
capper.fit(X_train)

In [9]:
# Lower capping value stored for each variable after fitting.
capper.left_tail_caps_

{'worst smoothness': 0.0960535, 'worst texture': 16.7975}

In [10]:
# Upper capping value stored for each variable after fitting.
capper.right_tail_caps_

{'worst smoothness': 0.17321499999999998, 'worst texture': 36.2775}

In [11]:
# Apply the fitted capper to both sets.
X_train, X_test = capper.transform(X_train), capper.transform(X_test)

# Check: show the minimum and maximum of the capped variables in the
# transformed training set.
capped = X_train[capper.variables_]
capped.min(), capped.max()

(worst smoothness     0.096053
 worst texture       16.797500
 dtype: float64,
 worst smoothness     0.173215
 worst texture       36.277500
 dtype: float64)