In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Every CSV is read from this one directory; change DATA_DIR to relocate
# the whole dataset.
DATA_DIR = 'data'

customers = pd.read_csv(f'{DATA_DIR}/olist_customers_dataset.csv')
orders = pd.read_csv(f'{DATA_DIR}/olist_orders_dataset.csv')
order_items = pd.read_csv(f'{DATA_DIR}/olist_order_items_dataset.csv')
products = pd.read_csv(f'{DATA_DIR}/olist_products_dataset.csv')
sellers = pd.read_csv(f'{DATA_DIR}/olist_sellers_dataset.csv')
payments = pd.read_csv(f'{DATA_DIR}/olist_order_payments_dataset.csv')
order_reviews = pd.read_csv(f'{DATA_DIR}/olist_order_reviews_dataset.csv')
geolocation = pd.read_csv(f'{DATA_DIR}/olist_geolocation_dataset.csv')
# `reviews` holds the same data as `order_reviews`. It used to be produced by
# parsing the reviews CSV a second time; an independent copy gives the same
# frame without the extra read. The name is kept in case other cells use it.
reviews = order_reviews.copy()
category_translation = pd.read_csv(f'{DATA_DIR}/product_category_name_translation.csv')

In [3]:
import pandas as pd

def inspect_data(df, name="DataFrame"):
    """Print a plain-text inspection report for a DataFrame.

    The report contains, in order: a banner with ``name``, the shape, a
    per-column table (dtype, null count with percentage, ``nunique()``),
    the columns that contain nulls, the number of fully duplicated rows
    (only when there are any), and the first three rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is only read, never modified.
    name : str, default "DataFrame"
        Label shown in the banner.

    Returns
    -------
    None
        Everything is written to stdout with ``print``.
    """
    n_rows = len(df)

    def pct(count):
        # A zero-row frame has nothing to divide by; report 0.0% instead of
        # dividing by zero (which would print "nan%" or raise).
        return count / n_rows * 100 if n_rows else 0.0

    # Null counts are computed once and shared by the column table and the
    # missing-data section below.
    null_counts = df.isnull().sum()

    print(f"\n{'='*60}")
    print(f"INSPECTING: {name}")
    print(f"{'='*60}\n")

    # Basic info
    print(f"Shape: {df.shape[0]:,} rows x {df.shape[1]} columns\n")

    # Column overview
    print("Column Overview:")
    print(f"{'Column':<30} {'Type':<15} {'Nulls':<10} {'Unique':<10}")
    print("-" * 65)
    for col in df.columns:
        dtype = str(df[col].dtype)
        n_null = int(null_counts[col])
        nulls = f"{n_null} ({pct(n_null):.1f}%)"
        unique = df[col].nunique()
        print(f"{col:<30} {dtype:<15} {nulls:<10} {unique:<10}")

    # Missing data: list only the columns that actually contain nulls
    missing = null_counts[null_counts > 0]
    if len(missing) > 0:
        print("\n⚠️  Columns with missing data:")
        for col, count in missing.items():
            count = int(count)
            print(f"  • {col}: {count:,} ({pct(count):.1f}%)")

    # Duplicates: rows that repeat an earlier row across all columns
    dup_count = int(df.duplicated().sum())
    if dup_count > 0:
        print(f"\n⚠️  Duplicate rows: {dup_count:,} ({pct(dup_count):.1f}%)")

    # Sample data
    print("\nFirst 3 rows:")
    print(df.head(3))

    print("\n" + "="*60 + "\n")


# Register each loaded table under a readable label so that all of them can
# be inspected in a single pass.
dataframes = dict(
    customers=customers,
    orders=orders,
    order_items=order_items,
    products=products,
    sellers=sellers,
    payments=payments,
    order_reviews=order_reviews,
    geolocation=geolocation,
    category_translation=category_translation,
)

# Print the inspection report for every table, in registration order.
for name, df in dataframes.items():
    inspect_data(df, name)

# To look at a single table instead:
# inspect_data(customers, 'customers')


INSPECTING: customers

Shape: 99,441 rows x 5 columns

Column Overview:
Column                         Type            Nulls      Unique    
-----------------------------------------------------------------
customer_id                    object          0 (0.0%)   99441     
customer_unique_id             object          0 (0.0%)   96096     
customer_zip_code_prefix       int64           0 (0.0%)   14994     
customer_city                  object          0 (0.0%)   4119      
customer_state                 object          0 (0.0%)   27        

First 3 rows:
                        customer_id                customer_unique_id  \
0  06b8999e2fba1a1fbc88172c00ba8bc7  861eff4711a542e4b93843c6dd7febb0   
1  18955e83d337fd6b2def6b18a428ac77  290c77bc529b7ac935b93aa66c333dc3   
2  4e7b3e00288586ebd08712fdd0374a03  060e732b5b29e8181a18229c7b0b2b5e   

   customer_zip_code_prefix          customer_city customer_state  
0                     14409                 franca             SP  
1  

In [1]:
#mike