# 01. Data Overview and Cleaning

This notebook performs initial data exploration, cleaning, and transformation of the Global Superstore dataset.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import json
from datetime import datetime

# Notebook-wide display configuration.
# NOTE: the first line silences every warning category for the rest of the session.
warnings.simplefilter('ignore')
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Data Loading and Initial Discovery

In [2]:
# Read the raw extract into `df`; the file is decoded as latin-1
df = pd.read_csv('../data/raw_data.csv', encoding='latin-1')

# Report the dimensions of the loaded frame
for line in (
    f"Dataset Shape: {df.shape}",
    f"Rows: {df.shape[0]:,}",
    f"Columns: {df.shape[1]}",
):
    print(line)

Dataset Shape: (51290, 24)
Rows: 51,290
Columns: 24


In [3]:
# Preview the top of the frame (first five records)
df.head(n=5)

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,City,State,...,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit,Shipping Cost,Order Priority
0,32298,CA-2012-124891,31-07-2012,31-07-2012,Same Day,RH-19495,Rick Hansen,Consumer,New York City,New York,...,TEC-AC-10003033,Technology,Accessories,Plantronics CS510 - Over-the-Head monaural Wir...,2309.65,7,0.0,762.1845,933.57,Critical
1,26341,IN-2013-77878,05-02-2013,07-02-2013,Second Class,JR-16210,Justin Ritter,Corporate,Wollongong,New South Wales,...,FUR-CH-10003950,Furniture,Chairs,"Novimex Executive Leather Armchair, Black",3709.395,9,0.1,-288.765,923.63,Critical
2,25330,IN-2013-71249,17-10-2013,18-10-2013,First Class,CR-12730,Craig Reiter,Consumer,Brisbane,Queensland,...,TEC-PH-10004664,Technology,Phones,"Nokia Smart Phone, with Caller ID",5175.171,9,0.1,919.971,915.49,Medium
3,13524,ES-2013-1579342,28-01-2013,30-01-2013,First Class,KM-16375,Katherine Murray,Home Office,Berlin,Berlin,...,TEC-PH-10004583,Technology,Phones,"Motorola Smart Phone, Cordless",2892.51,5,0.1,-96.54,910.16,Medium
4,47221,SG-2013-4320,05-11-2013,06-11-2013,Same Day,RH-9495,Rick Hansen,Consumer,Dakar,Dakar,...,TEC-SHA-10000501,Technology,Copiers,"Sharp Wireless Fax, High-Speed",2832.96,8,0.0,311.52,903.04,Critical


In [4]:
# Column information and data types.
# Use the non-null counts to spot incomplete columns, and the dtypes to spot
# fields (such as the two date columns) that were read as plain `object` text.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51290 entries, 0 to 51289
Data columns (total 24 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Row ID          51290 non-null  int64  
 1   Order ID        51290 non-null  object 
 2   Order Date      51290 non-null  object 
 3   Ship Date       51290 non-null  object 
 4   Ship Mode       51290 non-null  object 
 5   Customer ID     51290 non-null  object 
 6   Customer Name   51290 non-null  object 
 7   Segment         51290 non-null  object 
 8   City            51290 non-null  object 
 9   State           51290 non-null  object 
 10  Country         51290 non-null  object 
 11  Postal Code     9994 non-null   float64
 12  Market          51290 non-null  object 
 13  Region          51290 non-null  object 
 14  Product ID      51290 non-null  object 
 15  Category        51290 non-null  object 
 16  Sub-Category    51290 non-null  object 
 17  Product Name    51290 non-null 

In [5]:
# Statistical summary.
# describe() with default arguments covers only the numeric columns, so the
# text/date columns do not appear in this table.
df.describe()

Unnamed: 0,Row ID,Postal Code,Sales,Quantity,Discount,Profit,Shipping Cost
count,51290.0,9994.0,51290.0,51290.0,51290.0,51290.0,51290.0
mean,25645.5,55190.379428,246.490581,3.476545,0.142908,28.610982,26.375915
std,14806.29199,32063.69335,487.565361,2.278766,0.21228,174.340972,57.296804
min,1.0,1040.0,0.444,1.0,0.0,-6599.978,0.0
25%,12823.25,23223.0,30.758625,2.0,0.0,0.0,2.61
50%,25645.5,56430.5,85.053,3.0,0.0,9.24,7.79
75%,38467.75,90008.0,251.0532,5.0,0.2,36.81,24.45
max,51290.0,99301.0,22638.48,14.0,0.85,8399.976,933.57


In [6]:
# Tabulate nulls per column, both as an absolute count and as a share of all rows
null_counts = df.isnull().sum()
missing_data = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': null_counts,
    'Missing_Percentage': null_counts / len(df) * 100,
})

# Display only the columns that actually contain nulls, worst first
has_missing = missing_data['Missing_Count'] > 0
missing_data.loc[has_missing].sort_values('Missing_Percentage', ascending=False)

Unnamed: 0,Column,Missing_Count,Missing_Percentage
Postal Code,Postal Code,41296,80.51472


In [7]:
# Cardinality of the first ten object-typed columns.
# At this point the date columns are still text, so they are included here.
categorical_columns = df.select_dtypes(include=['object']).columns

unique_counts = {name: df[name].nunique() for name in categorical_columns[:10]}

# One row per column, highest cardinality first
(
    pd.DataFrame(unique_counts, index=['Unique Values'])
    .T
    .sort_values('Unique Values', ascending=False)
)

Unnamed: 0,Unique Values
Order ID,25035
City,3636
Customer ID,1590
Ship Date,1464
Order Date,1430
State,1094
Customer Name,795
Country,147
Ship Mode,4
Segment,3


## 2. Data Cleaning and Transformation

In [8]:
# Create a copy for cleaning so the raw frame `df` stays untouched
df_clean = df.copy()

# Handle missing values
print(f"Missing values before cleaning: {df_clean.isnull().sum().sum()}")

# Drop rows with missing critical fields (if any)
critical_columns = ['Order ID', 'Customer ID', 'Product ID', 'Sales']
df_clean = df_clean.dropna(subset=[col for col in critical_columns if col in df_clean.columns])

# Fill missing postal codes with 'Unknown'.
# The column is read as float64 (NaN forces a float dtype), so filling it
# directly with a string would leave a mixed float/str column whose real codes
# serialise as e.g. "10024.0". Casting through nullable Int64 first drops the
# spurious ".0" and yields a uniform text column.
# The Int64 cast raises if a code has a fractional part, which would indicate
# corrupt input rather than something to paper over.
# TODO(review): leading zeros are already lost when the CSV is read as numbers;
# zero-padding may be appropriate if these are fixed-width codes — confirm.
if 'Postal Code' in df_clean.columns:
    postal_codes = df_clean['Postal Code']
    if pd.api.types.is_float_dtype(postal_codes):
        postal_codes = postal_codes.astype('Int64')
    df_clean['Postal Code'] = postal_codes.astype('string').fillna('Unknown')

print(f"Missing values after cleaning: {df_clean.isnull().sum().sum()}")
print(f"Rows after cleaning: {len(df_clean):,}")

Missing values before cleaning: 41296
Missing values after cleaning: 0
Rows after cleaning: 51,290


In [9]:
# Parse date columns.
# The raw values are day-first strings (e.g. "31-07-2012", "17-10-2013").
# Without an explicit format, ambiguous values such as "05-02-2013" can be read
# month-first or silently coerced to NaT depending on the pandas version, which
# would corrupt every date-derived column. Pin the format instead of inferring it.
DATE_FORMAT = '%d-%m-%Y'
date_columns = ['Order Date', 'Ship Date']
for col in date_columns:
    if col in df_clean.columns:
        parsed = pd.to_datetime(df_clean[col], format=DATE_FORMAT, errors='coerce')
        # errors='coerce' turns non-matching values into NaT; surface how many
        # were lost instead of hiding them (prints nothing on clean data).
        n_unparsed = int(parsed.isna().sum() - df_clean[col].isna().sum())
        if n_unparsed > 0:
            print(f"Warning: {n_unparsed:,} values in '{col}' did not match {DATE_FORMAT} and were set to NaT")
        df_clean[col] = parsed

# Create date components
if 'Order Date' in df_clean.columns:
    df_clean['order_year'] = df_clean['Order Date'].dt.year
    df_clean['order_month'] = df_clean['Order Date'].dt.month
    df_clean['order_quarter'] = df_clean['Order Date'].dt.quarter
    # ISO week number (1-53); ISO weeks can belong to the adjacent calendar year
    df_clean['order_week'] = df_clean['Order Date'].dt.isocalendar().week
    # Monday=0 ... Sunday=6
    df_clean['order_day_of_week'] = df_clean['Order Date'].dt.dayofweek
    df_clean['order_day_name'] = df_clean['Order Date'].dt.day_name()

print("Date columns parsed successfully")

Date columns parsed successfully


In [10]:
# Convert numeric columns
numeric_columns = ['Sales', 'Quantity', 'Discount', 'Profit', 'Shipping Cost']
for col in numeric_columns:
    if col in df_clean.columns:
        raw_values = df_clean[col]
        n_missing_before = raw_values.isna().sum()

        # Remove currency symbols / thousands separators if the column is text.
        # regex=False makes '$' a literal on every pandas version; as a regex
        # it is the end-of-string anchor and would match nothing visible.
        if raw_values.dtype == 'object':
            raw_values = (
                raw_values.astype(str)
                .str.replace('$', '', regex=False)
                .str.replace(',', '', regex=False)
            )

        converted = pd.to_numeric(raw_values, errors='coerce')

        # errors='coerce' silently maps unparseable text to NaN; report how many
        # values were lost so bad input is not mistaken for a clean conversion.
        n_coerced = int(converted.isna().sum() - n_missing_before)
        if n_coerced > 0:
            print(f"Warning: {n_coerced:,} values in '{col}' could not be parsed and were set to NaN")

        df_clean[col] = converted

print("Numeric columns converted successfully")

Numeric columns converted successfully


In [11]:
# Normalise surrounding whitespace and letter casing of selected text fields.
# Note: str.title() also lower-cases the tail of all-caps tokens ("ABC" -> "Abc").
categorical_cols = ['Segment', 'Ship Mode', 'Region', 'Category', 'Sub-Category']
present_cols = [name for name in categorical_cols if name in df_clean.columns]
for name in present_cols:
    trimmed = df_clean[name].str.strip()
    df_clean[name] = trimmed.str.title()

print("Categorical values standardized")

Categorical values standardized


In [12]:
# Derive convenience columns from the cleaned base fields.
# Column availability is captured once; none of the checks below depend on
# the columns this cell adds.
available = set(df_clean.columns)

# `revenue` is a straight copy of Sales
if 'Sales' in available:
    df_clean['revenue'] = df_clean['Sales']

# Profit as a percentage of sales; rows with zero sales get 0 rather than inf/NaN
if {'Profit', 'Sales'} <= available:
    sales = df_clean['Sales']
    margin_pct = df_clean['Profit'] / sales * 100
    df_clean['profit_margin'] = np.where(sales != 0, margin_pct, 0)

# Whole days elapsed between ordering and shipping
if {'Ship Date', 'Order Date'} <= available:
    elapsed = df_clean['Ship Date'] - df_clean['Order Date']
    df_clean['delivery_days'] = elapsed.dt.days

print("Derived columns added")

Derived columns added


In [13]:
# Flag rows that repeat an earlier row in every column, then drop them if any exist
dup_mask = df_clean.duplicated()
duplicates = dup_mask.sum()
print(f"Duplicate rows: {duplicates}")

if duplicates > 0:
    # Keep the first occurrence of each row
    df_clean = df_clean[~dup_mask]
    print(f"Rows after removing duplicates: {len(df_clean):,}")

Duplicate rows: 0


## 3. Data Quality Checks

In [14]:
# Validation checks.
# Each entry records the check name, whether it passed, and a short detail string;
# the collected results are rendered as a table at the end of the cell.
checks = []

# Check 1: No null Order IDs
if 'Order ID' in df_clean.columns:
    null_orders = df_clean['Order ID'].isnull().sum()
    checks.append({'Check': 'No null Order IDs', 'Passed': null_orders == 0, 'Details': f'{null_orders} null values'})

# Check 2: Sales values are positive
# (only strictly negative values are counted; zeros and NaN pass this check)
if 'Sales' in df_clean.columns:
    negative_sales = (df_clean['Sales'] < 0).sum()
    checks.append({'Check': 'Positive sales values', 'Passed': negative_sales == 0, 'Details': f'{negative_sales} negative values'})

# Check 3: Dates are reasonable
# The result is computed rather than hard-coded: the column must actually be a
# datetime column, contain no missing/unparseable (NaT) entries, and have a
# defined minimum (i.e. at least one valid date).
if 'Order Date' in df_clean.columns:
    order_dates = df_clean['Order Date']
    min_date = order_dates.min()
    max_date = order_dates.max()
    null_dates = int(order_dates.isnull().sum())
    dates_ok = bool(
        pd.api.types.is_datetime64_any_dtype(order_dates)
        and null_dates == 0
        and pd.notna(min_date)
    )
    date_details = f'{min_date} to {max_date}'
    if null_dates > 0:
        date_details += f' ({null_dates} missing/unparseable)'
    checks.append({'Check': 'Reasonable date range', 'Passed': dates_ok, 'Details': date_details})

# Check 4: Numeric columns are numeric
for col in ['Sales', 'Quantity', 'Profit']:
    if col in df_clean.columns:
        is_numeric = pd.api.types.is_numeric_dtype(df_clean[col])
        checks.append({'Check': f'{col} is numeric', 'Passed': is_numeric, 'Details': str(df_clean[col].dtype)})

pd.DataFrame(checks)

Unnamed: 0,Check,Passed,Details
0,No null Order IDs,True,0 null values
1,Positive sales values,True,0 negative values
2,Reasonable date range,True,2011-01-01 00:00:00 to 2014-12-31 00:00:00
3,Sales is numeric,True,float64
4,Quantity is numeric,True,int64
5,Profit is numeric,True,float64


## 4. Export Cleaned Data

In [15]:
# Split the flat cleaned frame into four narrower tables.
# Orders, customers and products keep the first row seen for each key;
# order items keep every row.

# Orders: one row per Order ID
order_columns = [
    'Order ID', 'Order Date', 'Ship Date', 'Ship Mode', 'Customer ID',
    'Segment', 'Country', 'City', 'State', 'Postal Code', 'Region',
    'order_year', 'order_month', 'order_quarter', 'order_week', 'delivery_days',
]
orders_df = df_clean[order_columns].drop_duplicates('Order ID')

# Order items: line-level measures, no de-duplication
order_items_columns = [
    'Order ID', 'Product ID', 'Sales', 'Quantity', 'Discount',
    'Profit', 'Shipping Cost', 'profit_margin',
]
order_items_df = df_clean[order_items_columns]

# Customers: one row per Customer ID
customer_columns = ['Customer ID', 'Customer Name', 'Segment']
customers_df = df_clean[customer_columns].drop_duplicates('Customer ID')

# Products: one row per Product ID
product_columns = ['Product ID', 'Product Name', 'Category', 'Sub-Category']
products_df = df_clean[product_columns].drop_duplicates('Product ID')

# Row counts of the resulting tables
for label, table in (
    ('Orders', orders_df),
    ('Order Items', order_items_df),
    ('Customers', customers_df),
    ('Products', products_df),
):
    print(f"{label}: {len(table):,} rows")

Orders: 25,035 rows
Order Items: 51,290 rows
Customers: 1,590 rows
Products: 10,292 rows


In [16]:
# Write the full cleaned frame and each normalized table to CSV (no index column)
exports = {
    '../data/superstore_clean.csv': df_clean,
    '../data/orders_clean.csv': orders_df,
    '../data/order_items_clean.csv': order_items_df,
    '../data/customers_clean.csv': customers_df,
    '../data/products_clean.csv': products_df,
}
for path, frame in exports.items():
    frame.to_csv(path, index=False)

print("Cleaned data exported successfully!")

Cleaned data exported successfully!


In [17]:
# Build a small JSON-serialisable summary of the cleaned dataset and persist it
order_dates = df_clean['Order Date']
sales_per_order = df_clean.groupby('Order ID')['Sales'].sum()

summary = {
    'total_rows': len(df_clean),
    'total_columns': len(df_clean.columns),
    'date_range': {
        'start': str(order_dates.min()),
        'end': str(order_dates.max()),
    },
    # Distinct-value counts for the main entities
    'unique_counts': {
        label: df_clean[column].nunique()
        for label, column in (
            ('orders', 'Order ID'),
            ('customers', 'Customer ID'),
            ('products', 'Product ID'),
            ('categories', 'Category'),
            ('regions', 'Region'),
        )
    },
    # Cast to built-in float so the json module can serialise the values
    'metrics': {
        'total_revenue': float(df_clean['Sales'].sum()),
        'total_profit': float(df_clean['Profit'].sum()),
        'avg_order_value': float(sales_per_order.mean()),
    },
}

with open('../data/data_summary.json', 'w') as f:
    json.dump(summary, f, indent=2)

print("Data summary saved!")
print(json.dumps(summary, indent=2))

Data summary saved!
{
  "total_rows": 51290,
  "total_columns": 33,
  "date_range": {
    "start": "2011-01-01 00:00:00",
    "end": "2014-12-31 00:00:00"
  },
  "unique_counts": {
    "orders": 25035,
    "customers": 1590,
    "products": 10292,
    "categories": 3,
    "regions": 13
  },
  "metrics": {
    "total_revenue": 12642501.909880001,
    "total_profit": 1467457.29128,
    "avg_order_value": 504.99308607469555
  }
}
