In [1]:
def detect_discrete_outliers(data, types, threshold):
    data.dropna(inplace = True)
    
    if types == "IQR":
        Q1 = data.quantile(0.25)
        Q3 = data.quantile(0.75)
        IQR = Q3 - Q1
        mask1 = Q1 - 1.5 * IQR
        mask2 = Q3 + 1.5 * IQR
        outliers = (data < mask1) | (data > mask2)
    elif types == "Z":
        mean = data.mean()
        std = data.std()
        z_score = (data - mean)/std 
        outliers = abs(z_score) > threshold
    else:
        raise Warning("Only 2 types: IQR or Z")

    return outliers

def clean_outliers(data, types = "IQR", threshold = 3):
    '''
    This function will cleanse outliers only
    and leave missing values alone.
    '''
    df = data.copy()
    list_of_outliers = [False]*df.shape[0]
    for x in df.columns:
        list_of_outliers |= detect_discrete_outliers(df[x], types, 3)
    list_of_outliers
    return df[-list_of_outliers]

In [2]:
# Create test data
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(100).reshape(20,5), index=range(5,25))

df.iloc[5,3] = np.nan
df.iloc[8,2] = np.nan
df.iloc[11,1] = np.nan

df.iloc[4,1] = 10
df.iloc[6,2] = 15
df.iloc[19,4] = 33

df

Unnamed: 0,0,1,2,3,4
5,-0.64234,-0.413989,-1.644592,0.975346,1.722568
6,-0.192799,-1.748214,-0.514296,-0.313692,0.999975
7,-3.311296,1.304728,0.89235,-2.370398,0.948591
8,-1.411399,0.322231,0.624675,0.083492,-0.984289
9,-0.707767,10.0,3.713728,-0.524298,0.934491
10,0.122989,-1.955259,-0.466697,,-0.045226
11,1.309722,-0.700264,15.0,-0.309723,-1.481308
12,-1.581596,-0.083902,0.257302,-0.630449,0.904623
13,0.685175,-0.530248,,2.752155,0.723282
14,-0.259762,0.510162,1.179468,0.702884,-0.936503


In [3]:
clean_outliers(data=df, types = "IQR", threshold=3)

Unnamed: 0,0,1,2,3,4
5,-0.64234,-0.413989,-1.644592,0.975346,1.722568
6,-0.192799,-1.748214,-0.514296,-0.313692,0.999975
8,-1.411399,0.322231,0.624675,0.083492,-0.984289
10,0.122989,-1.955259,-0.466697,,-0.045226
12,-1.581596,-0.083902,0.257302,-0.630449,0.904623
14,-0.259762,0.510162,1.179468,0.702884,-0.936503
15,-0.050789,0.1025,1.046241,-0.725637,-0.557013
17,0.060791,-0.143386,1.092907,-0.041928,-0.990554
18,0.880698,-0.681162,-0.572442,1.051923,-1.234979
19,0.692174,0.999577,-0.553869,-0.168916,1.397119


In [4]:
clean_outliers(data=df, types = "Z", threshold=3)

Unnamed: 0,0,1,2,3,4
5,-0.64234,-0.413989,-1.644592,0.975346,1.722568
6,-0.192799,-1.748214,-0.514296,-0.313692,0.999975
7,-3.311296,1.304728,0.89235,-2.370398,0.948591
8,-1.411399,0.322231,0.624675,0.083492,-0.984289
10,0.122989,-1.955259,-0.466697,,-0.045226
12,-1.581596,-0.083902,0.257302,-0.630449,0.904623
13,0.685175,-0.530248,,2.752155,0.723282
14,-0.259762,0.510162,1.179468,0.702884,-0.936503
15,-0.050789,0.1025,1.046241,-0.725637,-0.557013
16,-2.743307,,1.334564,-1.792302,-1.79298


In [5]:
clean_outliers(data=df, types = "I", threshold=3)

Warning: Only 2 types: IQR or Z