In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np

# Toy dataset for the outlier demos below. Each feature is mostly small,
# evenly spaced values with two extreme entries mixed in
# (100 and 200 in Feature1; 500 and 900 in Feature2).
feature1_values = [1, 2, 3, 100, 5, 6, 7, 8, 200, 10]
feature2_values = [10, 20, 30, 40, 500, 60, 70, 80, 900, 100]
data = {'Feature1': feature1_values, 'Feature2': feature2_values}

# One column per feature, default integer index.
df = pd.DataFrame(data)

# Show the untouched data before any outlier handling is applied.
print("Original Dataset:")
df

Original Dataset:


Unnamed: 0,Feature1,Feature2
0,1,10
1,2,20
2,3,30
3,100,40
4,5,500
5,6,60
6,7,70
7,8,80
8,200,900
9,10,100


# Handling Outliers

In [2]:
# Technique 1: Trimming - masking values beyond a certain threshold.
# Values above `threshold` are replaced with NaN rather than dropping the
# row, so the new column stays aligned with the original index.
threshold = 50
# Vectorized mask: keep entries where the condition holds, NaN elsewhere
# (same result as an element-wise apply, without a Python-level lambda).
df['Feature1_trimmed'] = df['Feature1'].where(df['Feature1'] <= threshold)

df

Unnamed: 0,Feature1,Feature2,Feature1_trimmed
0,1,10,1.0
1,2,20,2.0
2,3,30,3.0
3,100,40,
4,5,500,5.0
5,6,60,6.0
6,7,70,7.0
7,8,80,8.0
8,200,900,
9,10,100,10.0


In [3]:
# Technique 2: Capping - limiting extreme values to a predefined upper limit.
# Anything greater than `upper_limit` is set to `upper_limit`; values at or
# below it are left unchanged.
upper_limit = 50
# Series.clip is the built-in, vectorized form of this one-sided cap.
df['Feature2_capped'] = df['Feature2'].clip(upper=upper_limit)

df

Unnamed: 0,Feature1,Feature2,Feature1_trimmed,Feature2_capped
0,1,10,1.0,10
1,2,20,2.0,20
2,3,30,3.0,30
3,100,40,,40
4,5,500,5.0,50
5,6,60,6.0,50
6,7,70,7.0,50
7,8,80,8.0,50
8,200,900,,50
9,10,100,10.0,50


In [4]:
# Technique 3: Z-Score - flag points whose absolute z-score exceeds a threshold
# and blank them out (NaN) in a filtered copy of the feature.
z_threshold = 2

feature1 = df['Feature1']
# Series.std() defaults to the sample standard deviation (ddof=1).
feature1_mean, feature1_std = feature1.mean(), feature1.std()

# Absolute distance from the mean, measured in standard deviations.
df['Feature1_zscore'] = np.abs((feature1 - feature1_mean) / feature1_std)

# Keep values whose z-score is within the threshold; everything else becomes NaN.
within_threshold = df['Feature1_zscore'] <= z_threshold
df['Feature1_zscore_filtered'] = feature1.where(within_threshold)

df

Unnamed: 0,Feature1,Feature2,Feature1_trimmed,Feature2_capped,Feature1_zscore,Feature1_zscore_filtered
0,1,10,1.0,10,0.507006,1.0
1,2,20,2.0,20,0.491735,2.0
2,3,30,3.0,30,0.476463,3.0
3,100,40,,40,1.004849,100.0
4,5,500,5.0,50,0.445921,5.0
5,6,60,6.0,50,0.43065,6.0
6,7,70,7.0,50,0.415378,7.0
7,8,80,8.0,50,0.400107,8.0
8,200,900,,50,2.531975,
9,10,100,10.0,50,0.369564,10.0


In [5]:
# Technique 4: IQR (Interquartile Range) - mask values that fall outside the
# fences [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]; masked entries become NaN.
feature2 = df['Feature2']

q1 = feature2.quantile(0.25)
q3 = feature2.quantile(0.75)
iqr = q3 - q1

# Fence distance shared by the lower and upper bounds.
fence_width = 1.5 * iqr
lower_limit = q1 - fence_width
upper_limit_iqr = q3 + fence_width

# between() is inclusive on both ends, matching a >= / <= comparison pair.
inside_fences = feature2.between(lower_limit, upper_limit_iqr)
df['Feature2_iqr_filtered'] = feature2.where(inside_fences)

df

Unnamed: 0,Feature1,Feature2,Feature1_trimmed,Feature2_capped,Feature1_zscore,Feature1_zscore_filtered,Feature2_iqr_filtered
0,1,10,1.0,10,0.507006,1.0,10.0
1,2,20,2.0,20,0.491735,2.0,20.0
2,3,30,3.0,30,0.476463,3.0,30.0
3,100,40,,40,1.004849,100.0,40.0
4,5,500,5.0,50,0.445921,5.0,
5,6,60,6.0,50,0.43065,6.0,60.0
6,7,70,7.0,50,0.415378,7.0,70.0
7,8,80,8.0,50,0.400107,8.0,80.0
8,200,900,,50,2.531975,,
9,10,100,10.0,50,0.369564,10.0,100.0


In [7]:
# Displaying the full DataFrame once more: the two original features
# alongside every derived column added by the outlier-handling cells above
# (the original Feature1/Feature2 columns themselves are not modified).
print("\nUpdated Dataset after Handling Outliers:")
df


Updated Dataset after Handling Outliers:


Unnamed: 0,Feature1,Feature2,Feature1_trimmed,Feature2_capped,Feature1_zscore,Feature1_zscore_filtered,Feature2_iqr_filtered
0,1,10,1.0,10,0.507006,1.0,10.0
1,2,20,2.0,20,0.491735,2.0,20.0
2,3,30,3.0,30,0.476463,3.0,30.0
3,100,40,,40,1.004849,100.0,40.0
4,5,500,5.0,50,0.445921,5.0,
5,6,60,6.0,50,0.43065,6.0,60.0
6,7,70,7.0,50,0.415378,7.0,70.0
7,8,80,8.0,50,0.400107,8.0,80.0
8,200,900,,50,2.531975,,
9,10,100,10.0,50,0.369564,10.0,100.0
