In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os
from pathlib import Path

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

print("✅ Libraries imported successfully!")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")

✅ Libraries imported successfully!
Pandas version: 2.1.4
NumPy version: 1.26.2


In [2]:
# Define data path
raw_data_path = Path('../data/raw')

# List all CSV files
csv_files = list(raw_data_path.glob('*.csv'))
print(f"Found {len(csv_files)} CSV files:\n")
for file in csv_files:
    print(f"  - {file.name}")

Found 9 CSV files:

  - olist_sellers_dataset.csv
  - product_category_name_translation.csv
  - olist_orders_dataset.csv
  - olist_order_items_dataset.csv
  - olist_customers_dataset.csv
  - olist_geolocation_dataset.csv
  - olist_order_payments_dataset.csv
  - olist_order_reviews_dataset.csv
  - olist_products_dataset.csv


In [3]:
# Load all datasets
print("Loading datasets...\n")

# Orders
orders = pd.read_csv(raw_data_path / 'olist_orders_dataset.csv')
print(f"✅ Orders: {orders.shape[0]:,} rows × {orders.shape[1]} columns")

# Order Items
order_items = pd.read_csv(raw_data_path / 'olist_order_items_dataset.csv')
print(f"✅ Order Items: {order_items.shape[0]:,} rows × {order_items.shape[1]} columns")

# Products
products = pd.read_csv(raw_data_path / 'olist_products_dataset.csv')
print(f"✅ Products: {products.shape[0]:,} rows × {products.shape[1]} columns")

# Customers
customers = pd.read_csv(raw_data_path / 'olist_customers_dataset.csv')
print(f"✅ Customers: {customers.shape[0]:,} rows × {customers.shape[1]} columns")

# Sellers
sellers = pd.read_csv(raw_data_path / 'olist_sellers_dataset.csv')
print(f"✅ Sellers: {sellers.shape[0]:,} rows × {sellers.shape[1]} columns")

# Payments
payments = pd.read_csv(raw_data_path / 'olist_order_payments_dataset.csv')
print(f"✅ Payments: {payments.shape[0]:,} rows × {payments.shape[1]} columns")

# Reviews
reviews = pd.read_csv(raw_data_path / 'olist_order_reviews_dataset.csv')
print(f"✅ Reviews: {reviews.shape[0]:,} rows × {reviews.shape[1]} columns")

# Geolocation
geolocation = pd.read_csv(raw_data_path / 'olist_geolocation_dataset.csv')
print(f"✅ Geolocation: {geolocation.shape[0]:,} rows × {geolocation.shape[1]} columns")

# Category translation
category_translation = pd.read_csv(raw_data_path / 'product_category_name_translation.csv')
print(f"✅ Category Translation: {category_translation.shape[0]:,} rows × {category_translation.shape[1]} columns")

print(f"\n🎉 Total records across all datasets: {orders.shape[0] + order_items.shape[0] + products.shape[0] + customers.shape[0] + sellers.shape[0] + payments.shape[0] + reviews.shape[0] + geolocation.shape[0]:,}")

Loading datasets...

✅ Orders: 99,441 rows × 8 columns
✅ Order Items: 112,650 rows × 7 columns
✅ Products: 32,951 rows × 9 columns
✅ Customers: 99,441 rows × 5 columns
✅ Sellers: 3,095 rows × 4 columns
✅ Payments: 103,886 rows × 5 columns
✅ Reviews: 99,224 rows × 7 columns
✅ Geolocation: 1,000,163 rows × 5 columns
✅ Category Translation: 71 rows × 2 columns

🎉 Total records across all datasets: 1,550,851


In [4]:
# Detailed look at the main Orders dataset
print("=" * 80)
print("ORDERS DATASET - DETAILED INSPECTION")
print("=" * 80)

print("\n📊 Column names and types:")
print(orders.dtypes)

print("\n📋 First 3 rows:")
display(orders.head(3))

print("\n❓ Missing values:")
missing = orders.isnull().sum()
print(missing[missing > 0])

print("\n📈 Basic statistics:")
print(orders.describe())

ORDERS DATASET - DETAILED INSPECTION

📊 Column names and types:
order_id                         object
customer_id                      object
order_status                     object
order_purchase_timestamp         object
order_approved_at                object
order_delivered_carrier_date     object
order_delivered_customer_date    object
order_estimated_delivery_date    object
dtype: object

📋 First 3 rows:


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00



❓ Missing values:
order_approved_at                 160
order_delivered_carrier_date     1783
order_delivered_customer_date    2965
dtype: int64

📈 Basic statistics:
                                order_id                       customer_id  \
count                              99441                             99441   
unique                             99441                             99441   
top     e481f51cbdc54678b7cc49136f2d6af7  9ef432eb6251297304e76186b10a928d   
freq                                   1                                 1   

       order_status order_purchase_timestamp    order_approved_at  \
count         99441                    99441                99281   
unique            8                    98875                90733   
top       delivered      2018-04-11 10:48:14  2018-02-27 04:31:10   
freq          96478                        3                    9   

       order_delivered_carrier_date order_delivered_customer_date  \
count                      