In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os
from pathlib import Path

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

print("✅ Libraries imported successfully!")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")

✅ Libraries imported successfully!
Pandas version: 2.1.4
NumPy version: 1.26.2


In [2]:
# Define data path
raw_data_path = Path('../data/raw')

# List all CSV files
csv_files = list(raw_data_path.glob('*.csv'))
print(f"Found {len(csv_files)} CSV files:\n")
for file in csv_files:
    print(f"  - {file.name}")

Found 9 CSV files:

  - olist_sellers_dataset.csv
  - product_category_name_translation.csv
  - olist_orders_dataset.csv
  - olist_order_items_dataset.csv
  - olist_customers_dataset.csv
  - olist_geolocation_dataset.csv
  - olist_order_payments_dataset.csv
  - olist_order_reviews_dataset.csv
  - olist_products_dataset.csv


In [3]:
# Load all datasets
print("Loading datasets...\n")

# Orders
orders = pd.read_csv(raw_data_path / 'olist_orders_dataset.csv')
print(f"✅ Orders: {orders.shape[0]:,} rows × {orders.shape[1]} columns")

# Order Items
order_items = pd.read_csv(raw_data_path / 'olist_order_items_dataset.csv')
print(f"✅ Order Items: {order_items.shape[0]:,} rows × {order_items.shape[1]} columns")

# Products
products = pd.read_csv(raw_data_path / 'olist_products_dataset.csv')
print(f"✅ Products: {products.shape[0]:,} rows × {products.shape[1]} columns")

# Customers
customers = pd.read_csv(raw_data_path / 'olist_customers_dataset.csv')
print(f"✅ Customers: {customers.shape[0]:,} rows × {customers.shape[1]} columns")

# Sellers
sellers = pd.read_csv(raw_data_path / 'olist_sellers_dataset.csv')
print(f"✅ Sellers: {sellers.shape[0]:,} rows × {sellers.shape[1]} columns")

# Payments
payments = pd.read_csv(raw_data_path / 'olist_order_payments_dataset.csv')
print(f"✅ Payments: {payments.shape[0]:,} rows × {payments.shape[1]} columns")

# Reviews
reviews = pd.read_csv(raw_data_path / 'olist_order_reviews_dataset.csv')
print(f"✅ Reviews: {reviews.shape[0]:,} rows × {reviews.shape[1]} columns")

# Geolocation
geolocation = pd.read_csv(raw_data_path / 'olist_geolocation_dataset.csv')
print(f"✅ Geolocation: {geolocation.shape[0]:,} rows × {geolocation.shape[1]} columns")

# Category translation
category_translation = pd.read_csv(raw_data_path / 'product_category_name_translation.csv')
print(f"✅ Category Translation: {category_translation.shape[0]:,} rows × {category_translation.shape[1]} columns")

print(f"\n🎉 Total records across all datasets: {orders.shape[0] + order_items.shape[0] + products.shape[0] + customers.shape[0] + sellers.shape[0] + payments.shape[0] + reviews.shape[0] + geolocation.shape[0]:,}")

Loading datasets...

✅ Orders: 99,441 rows × 8 columns
✅ Order Items: 112,650 rows × 7 columns
✅ Products: 32,951 rows × 9 columns
✅ Customers: 99,441 rows × 5 columns
✅ Sellers: 3,095 rows × 4 columns
✅ Payments: 103,886 rows × 5 columns
✅ Reviews: 99,224 rows × 7 columns
✅ Geolocation: 1,000,163 rows × 5 columns
✅ Category Translation: 71 rows × 2 columns

🎉 Total records across all datasets: 1,550,851


In [4]:
# Detailed look at the main Orders dataset
print("=" * 80)
print("ORDERS DATASET - DETAILED INSPECTION")
print("=" * 80)

print("\n📊 Column names and types:")
print(orders.dtypes)

print("\n📋 First 3 rows:")
display(orders.head(3))

print("\n❓ Missing values:")
missing = orders.isnull().sum()
print(missing[missing > 0])

print("\n📈 Basic statistics:")
print(orders.describe())

ORDERS DATASET - DETAILED INSPECTION

📊 Column names and types:
order_id                         object
customer_id                      object
order_status                     object
order_purchase_timestamp         object
order_approved_at                object
order_delivered_carrier_date     object
order_delivered_customer_date    object
order_estimated_delivery_date    object
dtype: object

📋 First 3 rows:


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00



❓ Missing values:
order_approved_at                 160
order_delivered_carrier_date     1783
order_delivered_customer_date    2965
dtype: int64

📈 Basic statistics:
                                order_id                       customer_id  \
count                              99441                             99441   
unique                             99441                             99441   
top     e481f51cbdc54678b7cc49136f2d6af7  9ef432eb6251297304e76186b10a928d   
freq                                   1                                 1   

       order_status order_purchase_timestamp    order_approved_at  \
count         99441                    99441                99281   
unique            8                    98875                90733   
top       delivered      2018-04-11 10:48:14  2018-02-27 04:31:10   
freq          96478                        3                    9   

       order_delivered_carrier_date order_delivered_customer_date  \
count                      

In [6]:
# ============================================================================
# DATE RANGE ANALYSIS
# ============================================================================

print("📅 DATE RANGE ANALYSIS")
print("=" * 80)

# Convert date columns to datetime
orders['order_purchase_timestamp'] = pd.to_datetime(orders['order_purchase_timestamp'])
orders['order_approved_at'] = pd.to_datetime(orders['order_approved_at'])
orders['order_delivered_customer_date'] = pd.to_datetime(orders['order_delivered_customer_date'])
orders['order_estimated_delivery_date'] = pd.to_datetime(orders['order_estimated_delivery_date'])

# Find date range
min_date = orders['order_purchase_timestamp'].min()
max_date = orders['order_purchase_timestamp'].max()
date_range_days = (max_date - min_date).days

print(f"\n📊 Data covers from {min_date.date()} to {max_date.date()}")
print(f"📊 Total duration: {date_range_days} days ({date_range_days/365:.1f} years)")

# Orders by year-month
orders['year_month'] = orders['order_purchase_timestamp'].dt.to_period('M')
orders_by_month = orders.groupby('year_month').size().reset_index(name='order_count')

print(f"\n📈 Orders per month:")
print(orders_by_month.to_string(index=False))

# Peak month
peak_month = orders_by_month.loc[orders_by_month['order_count'].idxmax()]
print(f"\n🔥 Peak month: {peak_month['year_month']} with {peak_month['order_count']:,} orders")

📅 DATE RANGE ANALYSIS

📊 Data covers from 2016-09-04 to 2018-10-17
📊 Total duration: 772 days (2.1 years)

📈 Orders per month:
year_month  order_count
   2016-09            4
   2016-10          324
   2016-12            1
   2017-01          800
   2017-02         1780
   2017-03         2682
   2017-04         2404
   2017-05         3700
   2017-06         3245
   2017-07         4026
   2017-08         4331
   2017-09         4285
   2017-10         4631
   2017-11         7544
   2017-12         5673
   2018-01         7269
   2018-02         6728
   2018-03         7211
   2018-04         6939
   2018-05         6873
   2018-06         6167
   2018-07         6292
   2018-08         6512
   2018-09           16
   2018-10            4

🔥 Peak month: 2017-11 with 7,544 orders


In [7]:
# ============================================================================
# ORDER STATUS ANALYSIS
# ============================================================================

print("\n📦 ORDER STATUS ANALYSIS")
print("=" * 80)

# Order status distribution
status_counts = orders['order_status'].value_counts()
status_pct = orders['order_status'].value_counts(normalize=True) * 100

print("\n📊 Order Status Distribution:")
for status, count in status_counts.items():
    pct = status_pct[status]
    print(f"  {status:20s}: {count:6,} ({pct:5.2f}%)")

# Calculate delivery performance
delivered_orders = orders[orders['order_status'] == 'delivered'].copy()
delivered_orders['delivery_time'] = (
    delivered_orders['order_delivered_customer_date'] - 
    delivered_orders['order_purchase_timestamp']
).dt.days

print(f"\n🚚 Delivery Performance (for {len(delivered_orders):,} delivered orders):")
print(f"  Average delivery time: {delivered_orders['delivery_time'].mean():.1f} days")
print(f"  Median delivery time:  {delivered_orders['delivery_time'].median():.1f} days")
print(f"  Min delivery time:     {delivered_orders['delivery_time'].min():.1f} days")
print(f"  Max delivery time:     {delivered_orders['delivery_time'].max():.1f} days")


📦 ORDER STATUS ANALYSIS

📊 Order Status Distribution:
  delivered           : 96,478 (97.02%)
  shipped             :  1,107 ( 1.11%)
  canceled            :    625 ( 0.63%)
  unavailable         :    609 ( 0.61%)
  invoiced            :    314 ( 0.32%)
  processing          :    301 ( 0.30%)
  created             :      5 ( 0.01%)
  approved            :      2 ( 0.00%)

🚚 Delivery Performance (for 96,478 delivered orders):
  Average delivery time: 12.1 days
  Median delivery time:  10.0 days
  Min delivery time:     0.0 days
  Max delivery time:     209.0 days


In [8]:
# ============================================================================
# PRODUCT CATEGORIES ANALYSIS
# ============================================================================

print("\n🏷️ PRODUCT CATEGORIES ANALYSIS")
print("=" * 80)

# Merge products with category translation
products_with_category = products.merge(
    category_translation,
    on='product_category_name',
    how='left'
)

# Top categories by product count
top_categories = products_with_category['product_category_name_english'].value_counts().head(15)

print(f"\n📊 Top 15 Product Categories by Product Count:")
for idx, (category, count) in enumerate(top_categories.items(), 1):
    print(f"  {idx:2d}. {str(category):30s}: {count:4,} products")

# Products without category
no_category = products_with_category['product_category_name_english'].isna().sum()
print(f"\n⚠️  Products without category translation: {no_category:,}")


🏷️ PRODUCT CATEGORIES ANALYSIS

📊 Top 15 Product Categories by Product Count:
   1. bed_bath_table                : 3,029 products
   2. sports_leisure                : 2,867 products
   3. furniture_decor               : 2,657 products
   4. health_beauty                 : 2,444 products
   5. housewares                    : 2,335 products
   6. auto                          : 1,900 products
   7. computers_accessories         : 1,639 products
   8. toys                          : 1,411 products
   9. watches_gifts                 : 1,329 products
  10. telephony                     : 1,134 products
  11. baby                          :  919 products
  12. perfumery                     :  868 products
  13. fashion_bags_accessories      :  849 products
  14. stationery                    :  849 products
  15. cool_stuff                    :  789 products

⚠️  Products without category translation: 623


In [9]:
# ============================================================================
# SALES & REVENUE ANALYSIS
# ============================================================================

print("\n💰 SALES & REVENUE ANALYSIS")
print("=" * 80)

# Merge order items with orders to get dates
order_items_with_dates = order_items.merge(
    orders[['order_id', 'order_purchase_timestamp', 'order_status']],
    on='order_id',
    how='left'
)

# Calculate total revenue
total_revenue = order_items_with_dates['price'].sum()
total_freight = order_items_with_dates['freight_value'].sum()
total_items = len(order_items_with_dates)

print(f"\n📈 Overall Metrics:")
print(f"  Total Revenue:     R$ {total_revenue:,.2f}")
print(f"  Total Freight:     R$ {total_freight:,.2f}")
print(f"  Combined Total:    R$ {total_revenue + total_freight:,.2f}")
print(f"  Total Items Sold:  {total_items:,}")
print(f"  Average Item Price: R$ {total_revenue / total_items:.2f}")

# Average Order Value (AOV)
revenue_by_order = order_items_with_dates.groupby('order_id')['price'].sum()
aov = revenue_by_order.mean()
median_order = revenue_by_order.median()

print(f"\n🛒 Order Value Metrics:")
print(f"  Average Order Value (AOV): R$ {aov:.2f}")
print(f"  Median Order Value:        R$ {median_order:.2f}")
print(f"  Min Order Value:           R$ {revenue_by_order.min():.2f}")
print(f"  Max Order Value:           R$ {revenue_by_order.max():.2f}")

# Items per order
items_per_order = order_items_with_dates.groupby('order_id').size()
print(f"\n📦 Items per Order:")
print(f"  Average items per order: {items_per_order.mean():.2f}")
print(f"  Median items per order:  {items_per_order.median():.0f}")
print(f"  Max items in one order:  {items_per_order.max()}")


💰 SALES & REVENUE ANALYSIS

📈 Overall Metrics:
  Total Revenue:     R$ 13,591,643.70
  Total Freight:     R$ 2,251,909.54
  Combined Total:    R$ 15,843,553.24
  Total Items Sold:  112,650
  Average Item Price: R$ 120.65

🛒 Order Value Metrics:
  Average Order Value (AOV): R$ 137.75
  Median Order Value:        R$ 86.90
  Min Order Value:           R$ 0.85
  Max Order Value:           R$ 13440.00

📦 Items per Order:
  Average items per order: 1.14
  Median items per order:  1
  Max items in one order:  21


In [10]:
# ============================================================================
# PAYMENT ANALYSIS
# ============================================================================

print("\n💳 PAYMENT ANALYSIS")
print("=" * 80)

# Payment type distribution
payment_types = payments['payment_type'].value_counts()
payment_pct = payments['payment_type'].value_counts(normalize=True) * 100

print("\n📊 Payment Methods Distribution:")
for ptype, count in payment_types.items():
    pct = payment_pct[ptype]
    avg_value = payments[payments['payment_type'] == ptype]['payment_value'].mean()
    print(f"  {ptype:15s}: {count:6,} ({pct:5.2f}%) - Avg: R$ {avg_value:,.2f}")

# Installments analysis
installment_dist = payments['payment_installments'].value_counts().sort_index()
print(f"\n📊 Payment Installments (Top 10):")
for installments, count in installment_dist.head(10).items():
    pct = (count / len(payments)) * 100
    print(f"  {installments:2d} installments: {count:6,} ({pct:5.2f}%)")


💳 PAYMENT ANALYSIS

📊 Payment Methods Distribution:
  credit_card    : 76,795 (73.92%) - Avg: R$ 163.32
  boleto         : 19,784 (19.04%) - Avg: R$ 145.03
  voucher        :  5,775 ( 5.56%) - Avg: R$ 65.70
  debit_card     :  1,529 ( 1.47%) - Avg: R$ 142.57
  not_defined    :      3 ( 0.00%) - Avg: R$ 0.00

📊 Payment Installments (Top 10):
   0 installments:      2 ( 0.00%)
   1 installments: 52,546 (50.58%)
   2 installments: 12,413 (11.95%)
   3 installments: 10,461 (10.07%)
   4 installments:  7,098 ( 6.83%)
   5 installments:  5,239 ( 5.04%)
   6 installments:  3,920 ( 3.77%)
   7 installments:  1,626 ( 1.57%)
   8 installments:  4,268 ( 4.11%)
   9 installments:    644 ( 0.62%)


In [11]:
# ============================================================================
# GEOGRAPHY ANALYSIS
# ============================================================================

print("\n🗺️ GEOGRAPHY ANALYSIS")
print("=" * 80)

# Top customer cities
top_customer_cities = customers['customer_city'].value_counts().head(10)
print("\n📍 Top 10 Customer Cities:")
for idx, (city, count) in enumerate(top_customer_cities.items(), 1):
    pct = (count / len(customers)) * 100
    print(f"  {idx:2d}. {city:25s}: {count:5,} ({pct:4.2f}%)")

# Top customer states
top_customer_states = customers['customer_state'].value_counts().head(10)
print("\n📍 Top 10 Customer States:")
for idx, (state, count) in enumerate(top_customer_states.items(), 1):
    pct = (count / len(customers)) * 100
    print(f"  {idx:2d}. {state:5s}: {count:5,} ({pct:4.2f}%)")

# Sellers distribution
top_seller_states = sellers['seller_state'].value_counts().head(10)
print("\n📍 Top 10 Seller States:")
for idx, (state, count) in enumerate(top_seller_states.items(), 1):
    pct = (count / len(sellers)) * 100
    print(f"  {idx:2d}. {state:5s}: {count:5,} ({pct:4.2f}%)")


🗺️ GEOGRAPHY ANALYSIS

📍 Top 10 Customer Cities:
   1. sao paulo                : 15,540 (15.63%)
   2. rio de janeiro           : 6,882 (6.92%)
   3. belo horizonte           : 2,773 (2.79%)
   4. brasilia                 : 2,131 (2.14%)
   5. curitiba                 : 1,521 (1.53%)
   6. campinas                 : 1,444 (1.45%)
   7. porto alegre             : 1,379 (1.39%)
   8. salvador                 : 1,245 (1.25%)
   9. guarulhos                : 1,189 (1.20%)
  10. sao bernardo do campo    :   938 (0.94%)

📍 Top 10 Customer States:
   1. SP   : 41,746 (41.98%)
   2. RJ   : 12,852 (12.92%)
   3. MG   : 11,635 (11.70%)
   4. RS   : 5,466 (5.50%)
   5. PR   : 5,045 (5.07%)
   6. SC   : 3,637 (3.66%)
   7. BA   : 3,380 (3.40%)
   8. DF   : 2,140 (2.15%)
   9. ES   : 2,033 (2.04%)
  10. GO   : 2,020 (2.03%)

📍 Top 10 Seller States:
   1. SP   : 1,849 (59.74%)
   2. PR   :   349 (11.28%)
   3. MG   :   244 (7.88%)
   4. SC   :   190 (6.14%)
   5. RJ   :   171 (5.53%)
   6. RS   : 

In [12]:
# ============================================================================
# CUSTOMER REVIEW ANALYSIS
# ============================================================================

print("\n⭐ CUSTOMER REVIEW ANALYSIS")
print("=" * 80)

# Review score distribution
review_scores = reviews['review_score'].value_counts().sort_index()
print("\n📊 Review Score Distribution:")
for score, count in review_scores.items():
    pct = (count / len(reviews)) * 100
    stars = "⭐" * int(score)
    print(f"  {stars:10s} ({score}): {count:6,} ({pct:5.2f}%)")

avg_score = reviews['review_score'].mean()
print(f"\n📊 Average Review Score: {avg_score:.2f} / 5.0")

# Reviews with comments
reviews_with_comments = reviews['review_comment_message'].notna().sum()
comment_rate = (reviews_with_comments / len(reviews)) * 100
print(f"\n💬 Reviews with comments: {reviews_with_comments:,} ({comment_rate:.2f}%)")


⭐ CUSTOMER REVIEW ANALYSIS

📊 Review Score Distribution:
  ⭐          (1): 11,424 (11.51%)
  ⭐⭐         (2):  3,151 ( 3.18%)
  ⭐⭐⭐        (3):  8,179 ( 8.24%)
  ⭐⭐⭐⭐       (4): 19,142 (19.29%)
  ⭐⭐⭐⭐⭐      (5): 57,328 (57.78%)

📊 Average Review Score: 4.09 / 5.0

💬 Reviews with comments: 40,977 (41.30%)


In [13]:
# ============================================================================
# DATA RELATIONSHIPS & QUALITY CHECK
# ============================================================================

print("\n🔗 DATA RELATIONSHIPS & QUALITY")
print("=" * 80)

print("\n📊 Key Relationships:")
print(f"  Unique Orders:        {orders['order_id'].nunique():,}")
print(f"  Unique Customers:     {orders['customer_id'].nunique():,}")
print(f"  Unique Products:      {order_items['product_id'].nunique():,}")
print(f"  Unique Sellers:       {order_items['seller_id'].nunique():,}")

print("\n📊 Customer Behavior:")
orders_per_customer = orders.groupby('customer_id').size()
print(f"  Customers with 1 order:    {(orders_per_customer == 1).sum():,} ({(orders_per_customer == 1).sum() / len(orders_per_customer) * 100:.2f}%)")
print(f"  Customers with 2+ orders:  {(orders_per_customer >= 2).sum():,} ({(orders_per_customer >= 2).sum() / len(orders_per_customer) * 100:.2f}%)")
print(f"  Max orders by one customer: {orders_per_customer.max()}")

print("\n✅ Data Quality Summary:")
print(f"  Orders with missing approved date:       {orders['order_approved_at'].isna().sum():,}")
print(f"  Orders with missing delivery date:       {orders['order_delivered_customer_date'].isna().sum():,}")
print(f"  Products without category:                {products['product_category_name'].isna().sum():,}")
print(f"  Orders without reviews:                   {len(orders) - len(reviews):,}")

print("\n🎯 DATASET IS READY FOR PIPELINE DEVELOPMENT!")


🔗 DATA RELATIONSHIPS & QUALITY

📊 Key Relationships:
  Unique Orders:        99,441
  Unique Customers:     99,441
  Unique Products:      32,951
  Unique Sellers:       3,095

📊 Customer Behavior:
  Customers with 1 order:    99,441 (100.00%)
  Customers with 2+ orders:  0 (0.00%)
  Max orders by one customer: 1

✅ Data Quality Summary:
  Orders with missing approved date:       160
  Orders with missing delivery date:       2,965
  Products without category:                610
  Orders without reviews:                   217

🎯 DATASET IS READY FOR PIPELINE DEVELOPMENT!


In [14]:
# ============================================================================
# KEY TAKEAWAYS FOR PIPELINE DEVELOPMENT
# ============================================================================

print("\n" + "="*80)
print("🎯 KEY TAKEAWAYS FOR PIPELINE DEVELOPMENT")
print("="*80)

takeaways = """
1. DATA VOLUME: 1.5M+ records across 9 tables - substantial dataset
2. TIME RANGE: ~2 years of data (2016-2018) - good for time series analysis
3. DATA QUALITY: 97% completion rate, minor missing values manageable
4. STAR SCHEMA READY: Clear fact (orders) and dimension tables (customers, products, sellers)
5. CUSTOMER RETENTION: 0% repeat rate - major business problem to analyze!
6. GEOGRAPHY: Highly concentrated in São Paulo - geographic analysis potential
7. REVIEWS: Rich sentiment data with 41% having text comments - NLP opportunity
8. PAYMENTS: Diverse methods and installment plans - financial analysis ready
9. DELIVERY: Average 12 days - operational metrics to track
10. CATEGORIES: 71 product categories - good for segmentation

NEXT STEPS:
✅ Data exploration complete
➡️ Set up PostgreSQL database (Docker)
➡️ Design star schema for data warehouse
➡️ Build ETL pipeline to load data
➡️ Set up real-time streaming simulation
"""

print(takeaways)


🎯 KEY TAKEAWAYS FOR PIPELINE DEVELOPMENT

1. DATA VOLUME: 1.5M+ records across 9 tables - substantial dataset
2. TIME RANGE: ~2 years of data (2016-2018) - good for time series analysis
3. DATA QUALITY: 97% completion rate, minor missing values manageable
4. STAR SCHEMA READY: Clear fact (orders) and dimension tables (customers, products, sellers)
5. CUSTOMER RETENTION: 0% repeat rate - major business problem to analyze!
6. GEOGRAPHY: Highly concentrated in São Paulo - geographic analysis potential
7. REVIEWS: Rich sentiment data with 41% having text comments - NLP opportunity
8. PAYMENTS: Diverse methods and installment plans - financial analysis ready
9. DELIVERY: Average 12 days - operational metrics to track
10. CATEGORIES: 71 product categories - good for segmentation

NEXT STEPS:
✅ Data exploration complete
➡️ Set up PostgreSQL database (Docker)
➡️ Design star schema for data warehouse
➡️ Build ETL pipeline to load data
➡️ Set up real-time streaming simulation

