## Handling Outliers

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [4]:
df=pd.DataFrame({"Age":[1,5,8,9,6,3,2,8,9,8,10,18,19,50,25,66,98]})

In [7]:
mean=np.mean(df['Age'])
mean

20.294117647058822

In [8]:
sd=np.std(df['Age'])
sd

25.779072008767457

z-score Method:

In [10]:
df['Z-Score'] = (df['Age'] - mean) /sd

# Step 5: Print the data
print("----------------------------------------")
print(f"Here is the data with outliers:\n {df}")
print("----------------------------------------")
# Step 6: Print the outliers
print(f"Here are the outliers based on the z-score threshold, 3:\n {df[df['Z-Score'] > 3]}")
print("----------------------------------------")
# Step 7: Remove the outliers
data = df[df['Z-Score'] <= 3]

# Step 8: Print the data without outliers
print(f"Here is the data without outliers:\n {df}")

----------------------------------------
Here is the data with outliers:
     Age   Z-Score
0     1 -0.748441
1     5 -0.593277
2     8 -0.476903
3     9 -0.438112
4     6 -0.554485
5     3 -0.670859
6     2 -0.709650
7     8 -0.476903
8     9 -0.438112
9     8 -0.476903
10   10 -0.399321
11   18 -0.088991
12   19 -0.050200
13   50  1.152326
14   25  0.182547
15   66  1.772984
16   98  3.014301
----------------------------------------
Here are the outliers based on the z-score threshold, 3:
     Age   Z-Score
16   98  3.014301
----------------------------------------
Here is the data without outliers:
     Age   Z-Score
0     1 -0.748441
1     5 -0.593277
2     8 -0.476903
3     9 -0.438112
4     6 -0.554485
5     3 -0.670859
6     2 -0.709650
7     8 -0.476903
8     9 -0.438112
9     8 -0.476903
10   10 -0.399321
11   18 -0.088991
12   19 -0.050200
13   50  1.152326
14   25  0.182547
15   66  1.772984
16   98  3.014301


scipy library using stats

In [12]:
# Import libraries
import numpy as np
from scipy import stats

# Sample data
data = [2.5, 2.7, 2.8, 3.0, 3.2, 3.4, 3.6, 3.8, 4.0, 110.0]

# Calculate the Z-score for each data point
z_scores = np.abs(stats.zscore(data))

# Set a threshold for identifying outliers
threshold = 2.5 
outliers = np.where(z_scores > threshold)[0]

# print the data
print("----------------------------------------")
print("Data:", data)
print("----------------------------------------")

print("Indices of Outliers:", outliers)
print("Outliers:", [data[i] for i in outliers])

# Remove outliers
data = [data[i] for i in range(len(data)) if i not in outliers]
print("----------------------------------------")
print("Data without outliers:", data)

----------------------------------------
Data: [2.5, 2.7, 2.8, 3.0, 3.2, 3.4, 3.6, 3.8, 4.0, 110.0]
----------------------------------------
Indices of Outliers: [9]
Outliers: [110.0]
----------------------------------------
Data without outliers: [2.5, 2.7, 2.8, 3.0, 3.2, 3.4, 3.6, 3.8, 4.0]


IQR Method:

In [13]:
# Step 1: Import the required libraries
import pandas as pd
import numpy as np

# Step 2: Create the data
data = pd.DataFrame({'Age': [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 50]})

# Step 3: Calculate the first and third quartile
Q1 = np.percentile(data['Age'], 25, interpolation = 'midpoint')
Q3 = np.percentile(data['Age'], 75, interpolation = 'midpoint')

# Step 4: Calculate the IQR
IQR = Q3 - Q1

# Step 5: Calculate the lower and upper bound
lower_bound = Q1 - (1.5 * IQR)
upper_bound = Q3 + (1.5 * IQR)

# Step 6: Print the data
print("----------------------------------------")
print(f"Here is the data with outliers:\n {data}")
print("----------------------------------------")
# Step 7: Print the outliers
print(f"Here are the outliers based on the IQR threshold:\n {data[(data['Age'] < lower_bound) | (data['Age'] > upper_bound)]}")
print("----------------------------------------")
# Step 8: Remove the outliers
data = data[(data['Age'] >= lower_bound) & (data['Age'] <= upper_bound)]

# Step 9: Print the data without outliers
print(f"Here is the data without outliers:\n {data}")

----------------------------------------
Here is the data with outliers:
     Age
0    20
1    21
2    22
3    23
4    24
5    25
6    26
7    27
8    28
9    29
10   30
11   50
----------------------------------------
Here are the outliers based on the IQR threshold:
     Age
11   50
----------------------------------------
Here is the data without outliers:
     Age
0    20
1    21
2    22
3    23
4    24
5    25
6    26
7    27
8    28
9    29
10   30
