# Finding Outliers

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

### Some criteria for identifying outliers
1. A data point that lies more than 1.5 × IQR above the 3rd quartile or below the 1st quartile.
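2. A data point that lies more than 3 standard deviations from the mean. This can be checked with a z-score: a point is an outlier if its absolute z-score exceeds the chosen threshold (3 here; 2 is sometimes used as a stricter cut-off).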

### Various ways to find outliers
1. Using scatter plots
2. Using box plots
3. Using the z-score
4. Using the IQR

In [None]:
# Sample readings, ten per row; 120, 110 and 105 are the unusually large values.
dataset = [
    12, 15, 16, 14, 13, 21, 18, 17, 16, 23,
    120, 22, 19, 14, 10, 8, 25, 18, 110, 30,
    9, 24, 11, 19, 16, 27, 105, 26, 15, 33,
]

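### Detecting outliers using the z-score
$$z = \frac{x - \mu}{\sigma}$$

where $x$ is a data point, $\mu$ is the mean of the data and $\sigma$ is its standard deviation.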

In [13]:
def detect_outliers(data, threshold=3):
    """Return the values in ``data`` whose absolute z-score exceeds ``threshold``.

    The z-score of a value ``x`` is ``(x - mean) / sd``, where ``sd`` is the
    population standard deviation (``np.std`` with its default ``ddof=0``).

    As a side effect, prints the z-score of every value, in input order,
    truncated toward zero to an integer (so 2.87 is shown as 2).

    Parameters
    ----------
    data : iterable of numbers
        The observations to screen.
    threshold : float, default 3
        A value is flagged when ``abs(z) > threshold``.

    Returns
    -------
    list
        The flagged values, in the order they appear in ``data``.
    """
    values = list(data)
    if not values:
        # Nothing to score; skip np.mean/np.std, which warn on empty input.
        print([])
        return []

    mean = np.mean(values)
    sd = np.std(values)

    if sd == 0:
        # All values are identical, so no value deviates from the mean.
        # Dividing by a zero standard deviation would yield NaN, which the
        # int() conversion below cannot handle.
        z_scores = [0.0] * len(values)
    else:
        z_scores = [(value - mean) / sd for value in values]

    print([int(z) for z in z_scores])
    return [value for value, z in zip(values, z_scores) if np.abs(z) > threshold]

In [14]:
# Run the z-score detector on the sample data; the call also prints the
# truncated z-score of every value as a side effect.
outliers_points = detect_outliers(dataset)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0]


In [12]:
# Display the values flagged by detect_outliers.
outliers_points

[120]

### IQR
The interquartile range (IQR) is the difference between the 75th percentile (Q3) and the 25th percentile (Q1) of the dataset.

#### Steps
1. Arrange the data in ascending order.
2. Calculate the first (q1) and third (q3) quartiles.
3. Find the IQR: `iqr = q3 - q1`.
4. Find the lower bound: `lower = q1 - 1.5 * iqr`.
5. Find the upper bound: `upper = q3 + 1.5 * iqr`.

Any value below the lower bound or above the upper bound is an outlier.

In [17]:
# Work on an ascending copy so the original dataset keeps its order.
data = list(dataset)
data.sort()

In [None]:
# First and third quartiles: the 25th and 75th percentiles of the data.
q1 = np.percentile(data, 25)
q3 = np.percentile(data, 75)
print(q1, q3)

14.25 24.75


In [None]:
# Interquartile range: the distance between the third and first quartiles.
iqr = q3-q1
iqr  # bare expression so the notebook displays the value

np.float64(10.5)

In [None]:
# Outlier fences: 1.5 * IQR below the first quartile and above the third.
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr

In [None]:
# Collect every value that lies strictly outside the [lower, upper] fences.
outliers_list = [value for value in data if value < lower or value > upper]
print(outliers_list, end="")

[105, 110, 120]