In [None]:
# Detect & Remove Outliers using IQR Method

# Objective: Learn to identify and remove outliers from a dataset using the Interquartile Range (IQR) method.
# Instructions:
# For each example, perform the following steps:
#     1. Load the Dataset: Load the dataset into your environment. You can use pandas to read the CSV file.
#     2. Calculate IQR: Calculate the first quartile (Q1), third quartile (Q3), and the IQR for the specified column.
#     3. Identify Outliers: Determine which data points are considered outliers.
#     4. Remove Outliers: Remove the outliers from the dataset.
#     5. Verify: Ensure the outliers are removed by checking the size or summary statistics of the dataset before and after the removal.
    
    
    

# Task:
#     Dataset: sales_data.csv (obtain or create it yourself; it must include a Monthly_Sales column)
#     Column to analyze: Monthly_Sales
#     Steps:
#         1. Load sales_data.csv.
#         2. Calculate Q1, Q3, and IQR for Monthly_Sales.
#         3. Identify outliers.
#         4. Remove the outliers.
#         5. Check the number of rows removed.







In [3]:
import pandas as pd

# Detect and remove outliers in one numeric column using the IQR rule:
# a value is an outlier when it lies below Q1 - k*IQR or above Q3 + k*IQR.
#
# NOTE: this cell reads 'sales_data.csv' from the working directory. In this
# notebook the cell that writes that file appears further down, so on a fresh
# kernel that cell has to be run first (or the file must already be on disk).

COLUMN = 'Monthly_Sales'  # column screened for outliers
IQR_MULTIPLIER = 1.5      # fence width k, in units of IQR

# Step 1: Load the dataset
df = pd.read_csv('sales_data.csv')
sales = df[COLUMN]

# Step 2: Calculate Q1, Q3, and IQR
Q1 = sales.quantile(0.25)
Q3 = sales.quantile(0.75)
IQR = Q3 - Q1

# Step 3: Identify bounds for outliers
lower_bound = Q1 - IQR_MULTIPLIER * IQR
upper_bound = Q3 + IQR_MULTIPLIER * IQR

# One mask drives both the "outliers" and the "cleaned" frames, so the two can
# never disagree. Values exactly on a fence are kept. A missing value (NaN)
# compares False on both sides, so it is not flagged as an outlier.
is_outlier = (sales < lower_bound) | (sales > upper_bound)
outliers = df[is_outlier]

# Step 4: Remove outliers
# Using the negated mask (instead of a second >= / <= comparison) keeps rows
# with a missing value in the data rather than silently dropping them and
# reporting them as outliers.
df_cleaned = df[~is_outlier]

# Step 5: Verify the number of rows removed
original_rows = len(df)
cleaned_rows = len(df_cleaned)
rows_removed = original_rows - cleaned_rows

# Every removed row must be one of the rows identified as an outlier.
assert rows_removed == len(outliers), "removed rows do not match identified outliers"

# Output results
print(f"Q1: {Q1}, Q3: {Q3}, IQR: {IQR}")
print(f"Lower Bound: {lower_bound}, Upper Bound: {upper_bound}")
print(f"Original Rows: {original_rows}")
print(f"Cleaned Rows: {cleaned_rows}")
print(f"Rows Removed (Outliers): {rows_removed}")


Q1: 4975.0, Q3: 6850.0, IQR: 1875.0
Lower Bound: 2162.5, Upper Bound: 9662.5
Original Rows: 51
Cleaned Rows: 49
Rows Removed (Outliers): 2


In [2]:
import pandas as pd

# Synthetic monthly sales figures used as the input for the IQR exercise.
# Two values (47000 and 100000) are far larger than the rest of the series.
monthly_sales = [
    4500, 4700, 4800, 5100, 5000, 5200, 5300, 4900,
    4950, 5050, 5100, 5150, 5250, 4900, 4850, 4950,
    4800, 4700, 4600, 4750, 47000, 4900, 5000, 5200,
    5300, 5500, 5600, 5700, 100000, 5800, 5900, 6000,
    6100, 6200, 6300, 6400, 6500, 6600, 6700, 6800,
    6900, 7000, 7100, 7200, 7300, 7500, 7600, 7700,
    7800, 7900, 8000,
]

# Single-column table; the column name is what the analysis cell looks up.
data = {'Monthly_Sales': monthly_sales}
df = pd.DataFrame(data)

# Write to the working directory without the index column.
df.to_csv('sales_data.csv', index=False)
