In [1]:
# IQR: Interquartile Range Method

# Not all data is normally distributed, or close enough to normal to be treated as drawn from a Gaussian distribution.
# A good statistic for summarizing a sample drawn from a non-Gaussian distribution is the interquartile range, or IQR for short.
# The IQR is the difference between the 75th and the 25th percentiles of the data; it defines the box in a box-and-whisker plot.


In [6]:
from numpy.random import seed
from numpy.random import randn
from numpy import mean
from numpy import std
import numpy as np

# Seed NumPy's global random generator so every run draws the same sample.
seed(1)

# Build the sample: 10,000 standard-normal draws, scaled by 5 and shifted by 50.
data = 50 + 5 * randn(10000)

# Print the sample mean and standard deviation, each to three decimal places.
print('mean=%.3f stdv=%.3f' % (mean(data), std(data)))

# Show the number of observations as the cell's result.
data.size

mean=50.049 stdv=4.994


10000

In [7]:
# Interquartile range: the distance between the 25th and 75th percentiles.
# Both percentiles are requested from a single np.percentile call.
q25, q75 = np.percentile(data, [25, 75])
iqr = q75 - q25

In [None]:
# We can then calculate the outlier cut-off as 1.5 times the IQR. Subtracting this cut-off from the 25th percentile
# and adding it to the 75th percentile gives the lower and upper limits on the data.

In [11]:
# Outlier limits: go 1.5 IQRs below the 25th percentile and 1.5 IQRs above the 75th.
cut_off = 1.5 * iqr
lower = q25 - cut_off
upper = q75 + cut_off

In [12]:
# Collect every observation that lies strictly beyond either limit.
outliers = [value for value in data if value > upper or value < lower]

In [13]:
# Keep every observation that is not an outlier. The bounds are inclusive on
# purpose: the outlier test uses strict comparisons, so a value exactly equal to
# a limit is not an outlier and must be kept here rather than silently dropped.
outliers_removed = [x for x in data if lower <= x <= upper]

In [16]:
# Number of observations remaining after the outliers have been filtered out.
len(outliers_removed)

9919