### 1. Trimming outliers from the dataset


In [None]:
def outliers_table(df, factor=1.5):
    """Summarise IQR-based outlier boundaries for every non-object column.

    For each column of ``df`` whose dtype is not ``object`` this computes the
    inter-quartile range (``IQR = Q3 - Q1``), the boundaries
    ``Q3 + factor * IQR`` and ``Q1 - factor * IQR``, and how many values lie
    strictly outside those boundaries.

    Quartiles are taken with ``Series.quantile``, which skips missing values,
    so a column containing NaN still gets finite boundaries (``np.quantile``
    would return NaN for such a column and the count would silently be 0).

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; ``object`` columns are ignored.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the boundaries.

    Returns
    -------
    pandas.DataFrame
        One row per analysed column, in the column order of ``df``, with the
        columns ``feature``, ``upper_boundary``, ``lower_boundary``, ``IQR``
        and ``Outliers_count``. Rows are labelled 0..n-1.
    """
    numeric = df[df.dtypes[df.dtypes != object].index]
    columns = ['feature', 'upper_boundary', 'lower_boundary', 'IQR',
               'Outliers_count']
    rows = []
    for feature in numeric.columns:
        values = numeric[feature]
        # Compute each quartile once and reuse it for the IQR and boundaries.
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        upper_boundary = q3 + (iqr * factor)
        lower_boundary = q1 - (iqr * factor)
        # Comparisons with NaN are False, so missing values are never counted.
        outside = (values > upper_boundary) | (values < lower_boundary)
        rows.append((feature, upper_boundary, lower_boundary, iqr,
                     int(outside.sum())))
    return pd.DataFrame(rows, columns=columns)

## Now trim the rows whose 'RM' value is an outlier
# 'RM' is a column of the Boston house table.
# Build the boundaries table once and select the 'RM' row by feature name,
# so the lookup does not depend on the position of 'RM' among the columns.
bounds_by_feature = outliers_table(df).set_index('feature')
rm_upper = bounds_by_feature.loc['RM', 'upper_boundary']
rm_lower = bounds_by_feature.loc['RM', 'lower_boundary']

# Boolean mask: True where 'RM' falls outside the IQR boundaries.
outliers_RM = ((df['RM'] > rm_upper) | (df['RM'] < rm_lower)).to_numpy()

# Keep only the rows that are not flagged as outliers.
df.loc[~(outliers_RM)]

In [None]:
## Alternative rule: boundaries at the mean -/+ `distance` standard deviations.
# NOTE(review): `variable` and `distance` are not defined in this cell —
# they must be set before it runs; confirm the intended values.
column_mean = df[variable].mean()
column_spread = df[variable].std() * distance
lower_boundary = column_mean - column_spread
upper_boundary = column_mean + column_spread

### 2. Performing winsorization
- Winsorization, or winsorizing, is the process of transforming the data by limiting the extreme values, that is, the outliers, to a certain arbitrary value, closer to the mean of the distribution. Winsorizing is different from trimming because the extreme values are not removed, but are instead replaced by other values. A typical strategy involves setting outliers to a specified percentile.

- For example, with 90% winsorization, we set all data below the 5th percentile to the value at the 5th percentile and all data above the 95th percentile to the value at the 95th percentile.

In [None]:
def winsorize(df, variable, upper_limit, lower_limit):
    """Return the values of ``df[variable]`` capped to the given limits.

    Values above ``upper_limit`` are replaced by ``upper_limit``; of the
    remaining values, those below ``lower_limit`` are replaced by
    ``lower_limit``; everything else is passed through unchanged. The result
    is whatever ``np.where`` returns for the column (an array, not a Series),
    and ``df`` itself is not modified.
    """
    values = df[variable]
    # Floor first; the upper cap applied afterwards takes precedence.
    floored = np.where(values < lower_limit, lower_limit, values)
    return np.where(values > upper_limit, upper_limit, floored)
variable = 'RM'
# Build the boundaries table once and select the row by feature name,
# so the lookup does not depend on the position of the column in `df`.
bounds_by_feature = outliers_table(df).set_index('feature')
upper_limit = bounds_by_feature.loc[variable, 'upper_boundary']
lower_limit = bounds_by_feature.loc[variable, 'lower_boundary']

# Cap 'RM' at its IQR boundaries instead of dropping the outlying rows.
winsorize(df, variable, upper_limit, lower_limit)

In [None]:
## The same can be done with the feature_engine library.
from feature_engine.outliers import Winsorizer

# Quantile-based capping on both tails of 'RM', 'LSTAT' and 'CRIM'.
# fold=0.05 means capping at the 5% and 95% quantiles.
# NOTE(review): the original note said fold=0.25 would mean 25% and 75% —
# confirm that feature_engine accepts a fold that large for the quantile method.
windsorizer = Winsorizer(
    capping_method='quantiles',
    tail='both',
    fold=0.05,
    variables=['RM', 'LSTAT', 'CRIM'],
)

windsorizer.fit_transform(df)

### 3. Capping the variable at arbitrary maximum and minimum values
- Similarly to winsorization, we can replace extreme values with values closer to the rest of the distribution, setting the maximum and minimum boundaries with the mean plus or minus a multiple of the standard deviation, or with the inter-quartile range proximity rule. This procedure is also called bottom and top coding, censoring, or capping. We can cap both extremes of the distribution or just one of the tails, depending on where we find the outliers in the variable.


In [None]:
from feature_engine.outliers import Winsorizer

# Gaussian capping on both tails of 'RM', 'LSTAT' and 'CRIM', with fold=3
# (boundaries at the mean -/+ 3 standard deviations).
# The Winsorizer in `feature_engine.outliers` takes this option as
# `capping_method`; passing it as `distribution` is rejected as an unknown
# keyword argument.
windsorizer = Winsorizer(
    capping_method='gaussian',
    tail='both',
    fold=3,
    variables=['RM', 'LSTAT', 'CRIM'],
)


### 4. Performing zero-coding – capping the variable at zero
- In econometrics and statistics, top-coding and bottom-coding refer to the act of censoring data points, the values of which are above or below a certain number or threshold, respectively.

- Zero-coding is a variant of bottom-coding and refers to the process of capping, usually the lower value of the variable, at zero. It is commonly used for variables that cannot take negative values, such as age or income.


In [1]:
from feature_engine.outliers import ArbitraryOutlierCapper

# Zero-coding: no upper cap is configured, only a lower cap of 0 for the
# variables 'x', 'y' and 'z' — values below that lower boundary are mapped
# to zero.
windsorizer = ArbitraryOutlierCapper(
    max_capping_dict=None,
    min_capping_dict=dict.fromkeys(('x', 'y', 'z'), 0),
)

