In [1]:
# Detect & Remove Outliers using IQR Method

# Objective: Learn to identify and remove outliers from a dataset using the Interquartile Range (IQR) method.
# Instructions:
# For each example, perform the following steps:
#     1. Load the Dataset: Load the dataset into your environment. You can use pandas to read the CSV file.
#     2. Calculate IQR: Calculate the first quartile (Q1), third quartile (Q3), and the IQR for the specified column.
#     3. Identify Outliers: Flag data points that fall below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR as outliers.
#     4. Remove Outliers: Remove the outliers from the dataset.
#     5. Verify: Ensure the outliers are removed by checking the size or summary statistics of the dataset before and after the removal.
    
    
    

# Task:
#     Dataset: sales_data.csv (obtain or create it yourself; it must include a Monthly_Sales column)
#     Column to analyze: Monthly_Sales
#     Steps:
#         1. Load sales_data.csv.
#         2. Calculate Q1, Q3, and IQR for Monthly_Sales.
#         3. Identify outliers.
#         4. Remove the outliers.
#         5. Check the number of rows removed.







In [2]:
import pandas as pd
import numpy as np

# Column under analysis and the fence width used by the IQR rule. 1.5 is the
# conventional multiplier; a larger value flags only more extreme points.
SALES_COLUMN = 'Monthly_Sales'
IQR_MULTIPLIER = 1.5


def iqr_fences(values, k=IQR_MULTIPLIER):
    """Return the quartiles, IQR and outlier fences of a numeric Series.

    Parameters
    ----------
    values : pd.Series
        Numeric observations to summarise.
    k : float, default 1.5
        Number of IQRs beyond Q1/Q3 at which the fences are placed.

    Returns
    -------
    tuple
        ``(q1, q3, iqr, lower, upper)`` with ``lower = q1 - k * iqr`` and
        ``upper = q3 + k * iqr``.
    """
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    return q1, q3, iqr, q1 - k * iqr, q3 + k * iqr


# Step 1: Simulate loading the sales_data.csv with a Monthly_Sales column.
# Swap this for pd.read_csv('sales_data.csv') once the real file is available.
data = pd.DataFrame({
    SALES_COLUMN: [250, 270, 260, 265, 255, 245, 280, 1000, 275, 260, 250, 240, 235, 230, 220, 210, 3000]
})

print("Original data shape:", data.shape)
print("Original data summary:")
print(data[SALES_COLUMN].describe())

# Step 2: Calculate Q1, Q3, and IQR (the fences are derived in the same pass)
Q1, Q3, IQR, lower_bound, upper_bound = iqr_fences(data[SALES_COLUMN])

print(f"\nQ1: {Q1}")
print(f"Q3: {Q3}")
print(f"IQR: {IQR}")

# Step 3: Identify outliers
print(f"\nLower bound for outliers: {lower_bound}")
print(f"Upper bound for outliers: {upper_bound}")

# A single mask drives both the "outliers" and the "cleaned" views, so the two
# subsets always partition the original rows and cannot drift apart.
is_outlier = (data[SALES_COLUMN] < lower_bound) | (data[SALES_COLUMN] > upper_bound)
outliers = data[is_outlier]
print(f"\nOutliers detected:\n{outliers}")

# Step 4: Remove outliers (a new frame is produced; `data` is left untouched)
cleaned_data = data[~is_outlier]

# Step 5: Verify removal
rows_removed = len(data) - len(cleaned_data)
# Every removed row must be one of the reported outliers, and nothing else.
assert rows_removed == len(outliers), "removed rows do not match detected outliers"

print("\nCleaned data shape:", cleaned_data.shape)
print(f"Number of rows removed: {rows_removed}")

print("\nCleaned data summary:")
print(cleaned_data[SALES_COLUMN].describe())


Original data shape: (17, 1)
Original data summary:
count      17.000000
mean      455.588235
std       680.629034
min       210.000000
25%       240.000000
50%       255.000000
75%       270.000000
max      3000.000000
Name: Monthly_Sales, dtype: float64

Q1: 240.0
Q3: 270.0
IQR: 30.0

Lower bound for outliers: 195.0
Upper bound for outliers: 315.0

Outliers detected:
    Monthly_Sales
7            1000
16           3000

Cleaned data shape: (15, 1)
Number of rows removed: 2

Cleaned data summary:
count     15.000000
mean     249.666667
std       20.041623
min      210.000000
25%      237.500000
50%      250.000000
75%      262.500000
max      280.000000
Name: Monthly_Sales, dtype: float64
