In [1]:
import pandas as pd
import warnings
import numpy as np
warnings.filterwarnings('ignore')

In [3]:
# Load the raw dataset, then show summary statistics as the cell's output.
df = pd.read_csv(filepath_or_buffer='../data/raw.csv')

# Last expression of the cell, so the notebook renders the describe() table.
df.describe()

Unnamed: 0,Row ID,Postal Code,Sales
count,9800.0,9789.0,9800.0
mean,4900.5,55273.322403,230.769059
std,2829.160653,32041.223413,626.651875
min,1.0,1040.0,0.444
25%,2450.75,23223.0,17.248
50%,4900.5,58103.0,54.49
75%,7350.25,90008.0,210.605
max,9800.0,99301.0,22638.48


In [4]:
# Missing value analysis
# Null counts per column before imputation. NOTE: this bare expression is not
# the last line of the cell, so its result is computed but not rendered.
df.isnull().sum()

# Assign the filled column back to the frame instead of calling
# fillna(..., inplace=True) on the df['Postal Code'] selection. The in-place
# form is a chained assignment: it acts on an intermediate object, so under
# pandas Copy-on-Write it leaves `df` unchanged (and recent pandas versions
# warn about it). Plain assignment updates `df` on every pandas version.
# The describe() output for this column is numeric, so after this step the
# column holds numbers mixed with the string 'Unknown'.
df['Postal Code'] = df['Postal Code'].fillna('Unknown')

# Null counts after imputation; this is the value the cell displays.
df.isnull().sum()

Row ID           0
Order ID         0
Order Date       0
Ship Date        0
Ship Mode        0
Customer ID      0
Customer Name    0
Segment          0
Country          0
City             0
State            0
Postal Code      0
Region           0
Product ID       0
Category         0
Sub-Category     0
Product Name     0
Sales            0
dtype: int64

In [5]:
# Outlier detection using Z-score method (less aggressive for skewed data)
def detect_outliers_zscore(df, column, threshold=3):
    """Flag rows whose value in `column` lies far from the column mean.

    The absolute Z-score of every value is computed from the column's own
    mean and standard deviation (as returned by ``.mean()`` / ``.std()``),
    and compared against `threshold`.

    Args:
        df: Table to inspect; expected to be a pandas DataFrame.
        column: Label of the column to score.
        threshold: Absolute Z-score above which a row is flagged (default 3).

    Returns:
        Boolean mask aligned with ``df[column]``; True marks rows whose
        absolute Z-score is strictly greater than `threshold`. Scores that
        evaluate to NaN (e.g. a zero-variance column) compare as False.
    """
    values = df[column]
    center = values.mean()
    spread = values.std()
    # |x - mean| / std is the same quantity as |(x - mean) / std| because the
    # standard deviation is never negative.
    abs_scores = (values - center).abs() / spread
    return abs_scores.gt(threshold)

# Report the starting size before anything is filtered.
print(f"Original data size: {len(df)} rows")

# Flag Sales outliers, then keep only the rows that were not flagged.
outlier_mask = detect_outliers_zscore(df, 'Sales', threshold=3)
df_cleaned = df.loc[~outlier_mask].copy()

# Summarise what the filter did.
print(f"Number of outliers detected (Z-score > 3): {outlier_mask.sum()}")
print(f"Data size after removing outliers: {len(df_cleaned)} rows")
print(f"Removed {len(df) - len(df_cleaned)} outlier rows")

# NOTE(review): rebinding `df` means re-running this cell scores the already
# filtered frame again, so repeated runs can drop additional rows.
df = df_cleaned

Original data size: 9800 rows
Number of outliers detected (Z-score > 3): 123
Data size after removing outliers: 9677 rows
Removed 123 outlier rows


In [6]:
# Write the current `df` to disk as CSV, leaving out the index column.
df.to_csv(path_or_buf='../data/cleaned.csv', index=False)
print("Cleaned data saved to data/cleaned.csv")

Cleaned data saved to data/cleaned.csv
