In [None]:
# Detect & Remove Outliers using IQR Method

# Objective: Learn to identify and remove outliers from a dataset using the Interquartile Range (IQR) method.
# Instructions:
# For each example, perform the following steps:
#     1. Load the Dataset: Load the dataset into your environment. You can use pandas to read the CSV file.
#     2. Calculate IQR: Calculate the first quartile (Q1), third quartile (Q3), and the IQR for the specified column.
#     3. Identify Outliers: Determine which data points are considered outliers.
#     4. Remove Outliers: Remove the outliers from the dataset.
#     5. Verify: Ensure the outliers are removed by checking the size or summary statistics of the dataset before and after the removal.
    
    
    

# Task:
#     Dataset: sales_data.csv (obtain or create it yourself; it must include a Monthly_Sales column)
#     Column to analyze: Monthly_Sales
#     Steps:
#         1. Load sales_data.csv.
#         2. Calculate Q1, Q3, and IQR for Monthly_Sales.
#         3. Identify outliers.
#         4. Remove the outliers.
#         5. Check the number of rows removed.







In [2]:
import pandas as pd
import numpy as np

# Tunable settings for this analysis, gathered in one place.
CSV_PATH = 'sales_data.csv'
COLUMN = 'Monthly_Sales'
IQR_MULTIPLIER = 1.5  # Tukey's conventional fence width


def remove_outliers_iqr(frame, column, k=1.5):
    """Split ``frame`` into inlier and outlier rows using the IQR rule.

    A row is an outlier when its ``column`` value lies strictly below
    ``Q1 - k * IQR`` or strictly above ``Q3 + k * IQR``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Input data; it is not modified.
    column : str
        Name of the numeric column to analyse.
    k : float, default 1.5
        Fence multiplier applied to the IQR.

    Returns
    -------
    cleaned : pandas.DataFrame
        Copy of the rows that are not outliers (original index preserved).
    outlier_rows : pandas.DataFrame
        Copy of the rows flagged as outliers.
    stats : dict
        Keys ``'Q1'``, ``'Q3'``, ``'IQR'``, ``'lower_bound'``, ``'upper_bound'``.

    Notes
    -----
    Rows whose value is missing (NaN) are never flagged: both comparisons in
    the outlier mask evaluate to False for NaN, so such rows stay in
    ``cleaned`` instead of being silently dropped and counted as outliers.
    """
    values = frame[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - k * iqr
    upper = q3 + k * iqr

    # Flag outliers explicitly (rather than keeping "in-range" rows) so that
    # NaN, which fails every comparison, is not mistaken for an outlier.
    is_outlier = (values < lower) | (values > upper)

    stats = {
        'Q1': q1,
        'Q3': q3,
        'IQR': iqr,
        'lower_bound': lower,
        'upper_bound': upper,
    }
    # .copy() yields independent frames, so later column assignments on the
    # results do not trigger chained-assignment warnings.
    return frame[~is_outlier].copy(), frame[is_outlier].copy(), stats


# Step 1: Create a dataset and save it as sales_data.csv
# The legacy global seed is kept on purpose: it reproduces exactly the same
# synthetic values on every run.
np.random.seed(0)

# 100 normal sales values centered around 2000
monthly_sales_normal = np.random.normal(loc=2000, scale=300, size=100).round(2)

# Adding 5 outlier values
outliers = np.array([5000, 5200, 4800, 6000, 6500])

# Combine both
monthly_sales = np.concatenate([monthly_sales_normal, outliers])

# Create a DataFrame
df_original = pd.DataFrame({COLUMN: monthly_sales})

# Save to CSV
df_original.to_csv(CSV_PATH, index=False)

print("Original dataset created and saved as 'sales_data.csv'")
print(df_original.tail(10))  # Show last few rows

# Step 2: Load the dataset
df = pd.read_csv(CSV_PATH)

# Steps 3 & 4: Calculate the IQR fences, identify the outliers, remove them
initial_rows = df.shape[0]
df_cleaned, outlier_rows, iqr_stats = remove_outliers_iqr(df, COLUMN, k=IQR_MULTIPLIER)
final_rows = df_cleaned.shape[0]

# Expose the statistics under their conventional names.
Q1 = iqr_stats['Q1']
Q3 = iqr_stats['Q3']
IQR = iqr_stats['IQR']
lower_bound = iqr_stats['lower_bound']
upper_bound = iqr_stats['upper_bound']

# Sanity check: every row is either kept or flagged, never both.
assert final_rows + len(outlier_rows) == initial_rows

# Step 5: Display results
print(f"\nQ1: {Q1}, Q3: {Q3}, IQR: {IQR}")
print(f"Lower Bound: {lower_bound}, Upper Bound: {upper_bound}")
print(f"Rows before removing outliers: {initial_rows}")
print(f"Rows after removing outliers: {final_rows}")
print(f"Total outliers removed: {initial_rows - final_rows}")



Original dataset created and saved as 'sales_data.csv'
     Monthly_Sales
95         2211.97
96         2003.15
97         2535.76
98         2038.07
99         2120.60
100        5000.00
101        5200.00
102        4800.00
103        6000.00
104        6500.00

Q1: 1826.35, Q3: 2270.25, IQR: 443.9000000000001
Lower Bound: 1160.4999999999998, Upper Bound: 2936.1000000000004
Rows before removing outliers: 105
Rows after removing outliers: 100
Total outliers removed: 5
