In [1]:
# Detect & Remove Outliers using IQR Method

# Objective: Learn to identify and remove outliers from a dataset using the Interquartile Range (IQR) method.
# Instructions:
# For each example, perform the following steps:
#     1. Load the Dataset: Load the dataset into your environment. You can use pandas to read the CSV file.
#     2. Calculate IQR: Calculate the first quartile (Q1), third quartile (Q3), and the IQR for the specified column.
#     3. Identify Outliers: Determine which data points are considered outliers.
#     4. Remove Outliers: Remove the outliers from the dataset.
#     5. Verify: Ensure the outliers are removed by checking the size or summary statistics of the dataset before and after the removal.
    
    
    

# Task:
#     Dataset: sales_data.csv (obtain or create it yourself; it must include a Monthly_Sales column)
#     Column to analyze: Monthly_Sales
#     Steps:
#         1. Load sales_data.csv.
#         2. Calculate Q1, Q3, and IQR for Monthly_Sales.
#         3. Identify outliers.
#         4. Remove the outliers.
#         5. Check the number of rows removed.



import pandas as pd

# --- Configuration: values a reader may want to tune -------------------------
CSV_PATH = "sales_data.csv"
SALES_COLUMN = "Monthly_Sales"
IQR_MULTIPLIER = 1.5  # conventional fence width, expressed in multiples of the IQR


def iqr_fences(values, k=IQR_MULTIPLIER):
    """Return ``(q1, q3, iqr, lower, upper)`` for a numeric pandas Series.

    Parameters
    ----------
    values : pandas.Series
        Numeric column to analyse.
    k : float, optional
        Fence multiplier; points farther than ``k * iqr`` outside the
        quartiles are treated as outliers. Defaults to ``IQR_MULTIPLIER``.

    Returns
    -------
    tuple
        ``q1`` (25th percentile), ``q3`` (75th percentile), ``iqr = q3 - q1``,
        ``lower = q1 - k * iqr`` and ``upper = q3 + k * iqr``.
    """
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    return q1, q3, iqr, q1 - k * iqr, q3 + k * iqr


def split_by_iqr(frame, column, k=IQR_MULTIPLIER):
    """Split ``frame`` into ``(kept, outliers)`` using the IQR rule on ``column``.

    Both parts are derived from ONE boolean mask, so every input row lands in
    exactly one of them. Rows whose value is missing compare False against
    both fences; they are therefore not flagged as outliers and stay in
    ``kept`` instead of silently disappearing from both results.

    Parameters
    ----------
    frame : pandas.DataFrame
        Input data; it is not modified.
    column : str
        Name of the numeric column to test.
    k : float, optional
        Fence multiplier forwarded to ``iqr_fences``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(kept, outliers)``, both keeping the original index labels.
    """
    _, _, _, lower, upper = iqr_fences(frame[column], k)
    is_outlier = (frame[column] < lower) | (frame[column] > upper)
    return frame[~is_outlier], frame[is_outlier]


# Create a sample dataset with Monthly_Sales (including some outliers)
data = {
    SALES_COLUMN: [100, 150, 160, 170, 180, 190, 10000, 200, 210, 220, 230, 240, 250, 30000]
}

# Save to CSV (simulates sales_data.csv).
# NOTE: this overwrites any existing file with the same name in the working directory.
pd.DataFrame(data).to_csv(CSV_PATH, index=False)

# Now load the dataset from the CSV
df = pd.read_csv(CSV_PATH)
print("Original dataset shape:", df.shape)
print(df)

# Calculate Q1, Q3, IQR and the outlier fences
Q1, Q3, IQR, lower_bound, upper_bound = iqr_fences(df[SALES_COLUMN])

print(f"\nQ1: {Q1}")
print(f"Q3: {Q3}")
print(f"IQR: {IQR}")
print(f"Lower bound: {lower_bound}")
print(f"Upper bound: {upper_bound}")

# Identify and remove outliers in a single pass so both views always agree
df_cleaned, outliers = split_by_iqr(df, SALES_COLUMN)

# Invariant: every original row is either kept or reported as an outlier
assert len(df_cleaned) + len(outliers) == len(df), "rows lost during outlier split"

print(f"\nNumber of outliers detected: {outliers.shape[0]}")
print(outliers)

print("\nCleaned dataset shape:", df_cleaned.shape)
print(df_cleaned)
print("Number of rows removed:", df.shape[0] - df_cleaned.shape[0])

































Original dataset shape: (14, 1)
    Monthly_Sales
0             100
1             150
2             160
3             170
4             180
5             190
6           10000
7             200
8             210
9             220
10            230
11            240
12            250
13          30000

Q1: 172.5
Q3: 237.5
IQR: 65.0
Lower bound: 75.0
Upper bound: 335.0

Number of outliers detected: 2
    Monthly_Sales
6           10000
13          30000

Cleaned dataset shape: (12, 1)
    Monthly_Sales
0             100
1             150
2             160
3             170
4             180
5             190
7             200
8             210
9             220
10            230
11            240
12            250
Number of rows removed: 2
