In [1]:
import numpy as np
import pandas as pd

In [2]:
# Toy salary column: nine evenly spaced values plus one extreme value (200000).
sampleData = pd.DataFrame({'Sal': [*range(1000, 10000, 1000), 200000]})

In [3]:
# Display the full frame as the cell's output.
sampleData

Unnamed: 0,Sal
0,1000
1,2000
2,3000
3,4000
4,5000
5,6000
6,7000
7,8000
8,9000
9,200000


In [4]:
# Quick heuristic (not foolproof!) for judging whether a column may contain outliers.
sampleData.describe()
# If the mean is close to the 50% percentile (the median), outliers are unlikely.
# The larger the gap between the two, the greater the chance that the dataset contains outliers.

Unnamed: 0,Sal
count,10.0
mean,24500.0
std,61718.446729
min,1000.0
25%,3250.0
50%,5500.0
75%,7750.0
max,200000.0


In [5]:
#Statistics suggests using either of the following methods:
# 1. z-score (standardization)
# 2. IQR (the recommended method when domain knowledge cannot help identify outliers)
#IQR (Interquartile Range) = Q3 - Q1
#Acceptable lower bound for a given column = Q1 - (1.5 * IQR)
#Acceptable upper bound for a given column = Q3 + (1.5 * IQR)


In [6]:
#Outlier detection algorithm
# 1. Sort the column in ascending order (np.percentile does this internally)
# 2. Get Q1 and Q3 and calculate IQR = Q3 - Q1
# 3. Calculate lower_range = Q1 - (1.5 * IQR)
# 4. Calculate upper_range = Q3 + (1.5 * IQR)
#Outlier elimination
# 5. Use the values from steps 3 and 4 to eliminate outliers from the data

In [9]:
def outlierDetection(dataColumn, whisker=1.5):
    """Return the IQR-based outlier fences for a numeric column.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR`` where
    ``IQR = Q3 - Q1``. Values outside ``[lowerRange, upperRange]`` are
    candidates for being treated as outliers.

    Parameters
    ----------
    dataColumn : array-like of numbers
        The values to analyse (list, NumPy array, pandas Series, ...).
        Missing values (NaN) are ignored when computing the quartiles.
    whisker : float, default 1.5
        Multiplier applied to the IQR when computing the fences.

    Returns
    -------
    (lowerRange, upperRange) : tuple of float
        The lower and upper acceptable bounds.
    """
    # No explicit sort is needed: the percentile routine orders the data itself
    # (the previous bare ``sorted(dataColumn)`` call discarded its result).
    # nanpercentile skips NaN so a single missing value does not turn both
    # fences into NaN and silently disable the outlier filter.
    Q1, Q3 = np.nanpercentile(dataColumn, [25, 75])
    IQR = Q3 - Q1
    lowerRange = Q1 - (whisker * IQR)
    upperRange = Q3 + (whisker * IQR)
    return lowerRange, upperRange

In [11]:
# Compute the lower and upper outlier fences for the salary column.
lower, upper = outlierDetection(sampleData['Sal'])

In [14]:
# Eliminate outliers: remove every row whose salary falls outside [lower, upper].
# Both fences are applied (the earlier version only checked the upper one), and the
# name is rebound to the result of drop() instead of mutating the frame in place.
# Rows with a missing salary compare False on both sides and are therefore kept.
is_outlier = (sampleData.Sal < lower) | (sampleData.Sal > upper)
sampleData = sampleData.drop(index=sampleData.index[is_outlier])

In [15]:
# Display the frame again, after the outlier-removal step.
sampleData

Unnamed: 0,Sal
0,1000
1,2000
2,3000
3,4000
4,5000
5,6000
6,7000
7,8000
8,9000
