In [1]:
# Define function to detect outliers for numerical variables
def clean_outliers(data, types = "IQR", threshold = 3.0):
    '''
    This function will cleanse outliers only
    and leave missing values alone.
    
    Parameters:
        data (DataFrame): Raw data that need to detect and clean the outliers.
        types (str): Declare the method to detect outliers ("IQR" - interquartile range or "Z" - Z-score)
        threshold (floar or int): Declare the threshold when detect the outliers with Z-score.
    Returns:
        result (DataFrame): Cleaned data.
    '''

    def detect_discrete_outliers(data, types, threshold):
        data.dropna(inplace = True)
        
        if types == "IQR":
            Q1 = data.quantile(0.25)
            Q3 = data.quantile(0.75)
            IQR = Q3 - Q1
            mask1 = Q1 - 1.5 * IQR
            mask2 = Q3 + 1.5 * IQR
            outliers = (data < mask1) | (data > mask2)
        elif types == "Z":
            mean = data.mean()
            std = data.std()
            z_score = (data - mean)/std 
            outliers = abs(z_score) > threshold
        else:
            raise Warning("Only 2 types: IQR or Z")
    
        return outliers
    
    df = data.copy()
    list_of_outliers = [False]*df.shape[0]
    for x in df:
        list_of_outliers |= detect_discrete_outliers(df[x], types, 3)
    result = df[-list_of_outliers]
    return result

### Test:

In [2]:
# Create test data
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(100).reshape(20,5), index=range(5,25))

df.iloc[5,3] = np.nan
df.iloc[8,2] = np.nan
df.iloc[11,1] = np.nan

df.iloc[4,1] = 10
df.iloc[6,2] = 15
df.iloc[19,4] = 33

df

Unnamed: 0,0,1,2,3,4
5,-0.028541,1.529365,-0.04051,0.81927,0.041689
6,-1.223723,-1.706463,-0.669168,0.792744,1.154751
7,-0.397869,-0.117151,-1.392212,1.265771,0.948511
8,1.082639,0.209075,1.255313,-1.562194,1.782077
9,-0.427627,10.0,-1.759546,0.80342,0.310316
10,2.70432,0.319401,-1.615452,,-1.104842
11,0.27307,1.225561,15.0,1.968553,-0.393373
12,-0.331732,-0.142921,-1.485617,0.387534,-0.883272
13,-0.034584,2.807816,,0.962811,2.237374
14,0.200647,-0.435053,0.625997,-1.36,-1.352491


In [3]:
type(3.5)

float

In [4]:
clean_outliers(data=df, types = "IQR", threshold=3)

Unnamed: 0,0,1,2,3,4
5,-0.028541,1.529365,-0.04051,0.81927,0.041689
6,-1.223723,-1.706463,-0.669168,0.792744,1.154751
7,-0.397869,-0.117151,-1.392212,1.265771,0.948511
8,1.082639,0.209075,1.255313,-1.562194,1.782077
12,-0.331732,-0.142921,-1.485617,0.387534,-0.883272
14,0.200647,-0.435053,0.625997,-1.36,-1.352491
15,1.292708,-0.729976,-0.350617,-0.674917,-0.118224
18,-0.228613,0.083118,0.370246,0.501226,-0.61292
20,0.006541,-0.395629,0.839575,-0.16789,1.166928
21,-0.203533,-1.733432,0.233834,0.057708,0.554831


In [5]:
clean_outliers(data=df, types = "Z", threshold=3)

Unnamed: 0,0,1,2,3,4
5,-0.028541,1.529365,-0.04051,0.81927,0.041689
6,-1.223723,-1.706463,-0.669168,0.792744,1.154751
7,-0.397869,-0.117151,-1.392212,1.265771,0.948511
8,1.082639,0.209075,1.255313,-1.562194,1.782077
10,2.70432,0.319401,-1.615452,,-1.104842
12,-0.331732,-0.142921,-1.485617,0.387534,-0.883272
13,-0.034584,2.807816,,0.962811,2.237374
14,0.200647,-0.435053,0.625997,-1.36,-1.352491
15,1.292708,-0.729976,-0.350617,-0.674917,-0.118224
16,0.364252,,0.591008,2.552133,0.632127


In [6]:
clean_outliers(data=df, types = "I", threshold=3)

Warning: Only 2 types: IQR or Z