In [1]:
def detect_discrete_outliers(data, types, threshold):
    data.dropna(inplace = True)
    
    if types == "IQR":
        Q1 = data.quantile(0.25)
        Q3 = data.quantile(0.75)
        IQR = Q3 - Q1
        mask1 = Q1 - 1.5 * IQR
        mask2 = Q3 + 1.5 * IQR
        outliers = (data < mask1) & (data > mask2)
    elif types == "Z":
        mean = data.mean()
        std = data.std()
        z_score = (data - mean)/std 
        outliers = abs(z_score) > threshold
    else:
        raise Warning("Only 2 types: IQR or Z")

    return outliers

def clean_outliers(data, types = "IQR", threshold = 3):
    '''
    This function will cleanse outliers only
    and leave missing values alone.
    '''
    df = data.copy()
    list_of_outliers = [False]*df.shape[0]
    for x in df:
        list_of_outliers |= detect_discrete_outliers(df[x], types, 3)
    list_of_outliers
    return df[-list_of_outliers]

In [2]:
# Create test data
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(100).reshape(20,5), index=range(5,25))

df.iloc[5,3] = np.nan
df.iloc[8,2] = np.nan
df.iloc[11,1] = np.nan

df.iloc[4,1] = 10
df.iloc[6,2] = 15
df.iloc[19,4] = 33

df

Unnamed: 0,0,1,2,3,4
5,-0.05732,-0.071192,0.440447,-1.812297,1.069912
6,0.378565,1.535975,-2.275993,-0.768625,-0.86903
7,-0.637205,1.369349,-0.797095,0.728448,0.238891
8,-1.331737,1.181629,-1.992896,-1.349648,-0.049244
9,-1.368368,10.0,1.849274,-0.169103,0.274129
10,0.484604,-1.194556,-0.497964,,0.971353
11,0.453539,1.530541,15.0,0.706543,0.077578
12,-1.24312,-1.355744,-0.392365,1.6472,0.59074
13,-0.129433,-0.905226,,0.034212,-0.49285
14,-0.67335,1.099022,0.993498,-0.514717,0.074182


In [3]:
clean_outliers(data=df, types = "IQR", threshold=3)

Unnamed: 0,0,1,2,3,4
5,-0.05732,-0.071192,0.440447,-1.812297,1.069912
6,0.378565,1.535975,-2.275993,-0.768625,-0.86903
7,-0.637205,1.369349,-0.797095,0.728448,0.238891
8,-1.331737,1.181629,-1.992896,-1.349648,-0.049244
9,-1.368368,10.0,1.849274,-0.169103,0.274129
10,0.484604,-1.194556,-0.497964,,0.971353
11,0.453539,1.530541,15.0,0.706543,0.077578
12,-1.24312,-1.355744,-0.392365,1.6472,0.59074
13,-0.129433,-0.905226,,0.034212,-0.49285
14,-0.67335,1.099022,0.993498,-0.514717,0.074182


In [4]:
clean_outliers(data=df, types = "Z", threshold=3)

Unnamed: 0,0,1,2,3,4
5,-0.05732,-0.071192,0.440447,-1.812297,1.069912
6,0.378565,1.535975,-2.275993,-0.768625,-0.86903
7,-0.637205,1.369349,-0.797095,0.728448,0.238891
8,-1.331737,1.181629,-1.992896,-1.349648,-0.049244
10,0.484604,-1.194556,-0.497964,,0.971353
12,-1.24312,-1.355744,-0.392365,1.6472,0.59074
13,-0.129433,-0.905226,,0.034212,-0.49285
14,-0.67335,1.099022,0.993498,-0.514717,0.074182
15,0.167817,-0.865028,-0.714394,-0.741785,-0.717694
16,0.761206,,-0.345373,-0.33903,1.08508


In [5]:
clean_outliers(data=df, types = "I", threshold=3)

Warning: Only 2 types: IQR or Z