## Step 1: Imports and Configuration

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
from pathlib import Path
import warnings

# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from the plotting/data libraries; consider
# narrowing the filter to specific categories or modules.
warnings.filterwarnings('ignore')

# Visualization settings
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Load the project configuration.
# The encoding is passed explicitly because open() otherwise falls back to the
# platform's locale encoding, which can mis-decode (or fail on) non-ASCII text
# in the YAML file on systems whose default is not UTF-8.
with open('../config.yaml', 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

print("‚úÖ Imports realizados com sucesso!")
print(f"üìä Projeto: {config['project']['name']}")
print(f"üì¶ Vers√£o: {config['project']['version']}")

‚úÖ Imports realizados com sucesso!
üìä Projeto: E-commerce Analytics Dashboard
üì¶ Vers√£o: 1.0.0


## Step 2: Load Data

In [6]:
# Directory containing the raw CSV files, relative to the notebook location.
data_path = Path('../data/raw/')


def _read_raw(file_name):
    """Read one CSV file located under ``data_path`` into a DataFrame."""
    return pd.read_csv(data_path / file_name)


# Load all datasets
print("üìÇ Carregando datasets...")

orders = _read_raw('olist_orders_dataset.csv')
order_items = _read_raw('olist_order_items_dataset.csv')
customers = _read_raw('olist_customers_dataset.csv')
products = _read_raw('olist_products_dataset.csv')
sellers = _read_raw('olist_sellers_dataset.csv')
payments = _read_raw('olist_order_payments_dataset.csv')
reviews = _read_raw('olist_order_reviews_dataset.csv')

print("‚úÖ Dados carregados!")

# Report the row count of each loaded table, one line per dataset.
for label, frame in (
    ("üì¶ Orders", orders),
    ("üì¶ Order Items", order_items),
    ("üë• Customers", customers),
    ("üõçÔ∏è Products", products),
    ("üè™ Sellers", sellers),
    ("üí≥ Payments", payments),
    ("‚≠ê Reviews", reviews),
):
    print(f"{label}: {len(frame):,} registros")

üìÇ Carregando datasets...
‚úÖ Dados carregados!
üì¶ Orders: 99,441 registros
üì¶ Order Items: 112,650 registros
üë• Customers: 99,441 registros
üõçÔ∏è Products: 32,951 registros
üè™ Sellers: 3,095 registros
üí≥ Payments: 103,886 registros
‚≠ê Reviews: 99,224 registros


## Step 3: Order Analysis

In [7]:
# Overview of the `orders` table: structure, sample rows, summary statistics,
# missing values per column and the distribution of order statuses.
section_rule = "=" * 80
print(section_rule)
print("üìä AN√ÅLISE DE PEDIDOS (ORDERS)")
print(section_rule)

print("\n1Ô∏è‚É£ Informa√ß√µes Gerais:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
orders.info()

print("\n2Ô∏è‚É£ Primeiras linhas:")
display(orders.head())

print("\n3Ô∏è‚É£ Estat√≠sticas Descritivas:")
display(orders.describe())

print("\n4Ô∏è‚É£ Valores Nulos:")
# Count of missing values in each column.
print(orders.isnull().sum())

print("\n5Ô∏è‚É£ Status dos Pedidos:")
# Number of orders per distinct order_status value.
print(orders['order_status'].value_counts())

üìä AN√ÅLISE DE PEDIDOS (ORDERS)

1Ô∏è‚É£ Informa√ß√µes Gerais:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   order_id                       99441 non-null  object
 1   customer_id                    99441 non-null  object
 2   order_status                   99441 non-null  object
 3   order_purchase_timestamp       99441 non-null  object
 4   order_approved_at              99281 non-null  object
 5   order_delivered_carrier_date   97658 non-null  object
 6   order_delivered_customer_date  96476 non-null  object
 7   order_estimated_delivery_date  99441 non-null  object
dtypes: object(8)
memory usage: 6.1+ MB
None

2Ô∏è‚É£ Primeiras linhas:


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15 00:00:00
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26 00:00:00



3Ô∏è‚É£ Estat√≠sticas Descritivas:


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
count,99441,99441,99441,99441,99281,97658,96476,99441
unique,99441,99441,8,98875,90733,81018,95664,459
top,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2018-08-02 12:06:07,2018-02-27 04:31:10,2018-05-09 15:48:00,2018-05-14 20:02:44,2017-12-20 00:00:00
freq,1,1,96478,3,9,47,3,522



4Ô∏è‚É£ Valores Nulos:
order_id                            0
customer_id                         0
order_status                        0
order_purchase_timestamp            0
order_approved_at                 160
order_delivered_carrier_date     1783
order_delivered_customer_date    2965
order_estimated_delivery_date       0
dtype: int64

5Ô∏è‚É£ Status dos Pedidos:
order_status
delivered      96478
shipped         1107
canceled         625
unavailable      609
invoiced         314
processing       301
created            5
approved           2
Name: count, dtype: int64


## Step 4: Date Conversion

In [8]:
print("üìÖ Convertendo colunas de data...")

# Converter colunas de data
date_columns = ['order_purchase_timestamp', 'order_approved_at', 
                'order_delivered_carrier_date', 'order_delivered_customer_date',
                'order_estimated_delivery_date']

for col in date_columns:
    if col in orders.columns:
        orders[col] = pd.to_datetime(orders[col])

print("‚úÖ Datas convertidas!")
print(f"üìÜ Per√≠odo: {orders['order_purchase_timestamp'].min()} at√© {orders['order_purchase_timestamp'].max()}")


üìÖ Convertendo colunas de data...
‚úÖ Datas convertidas!
üìÜ Per√≠odo: 2016-09-04 21:15:19 at√© 2018-10-17 17:30:18


## Step 5: Temporal Analysis

In [9]:
# Create time-based columns derived from the purchase timestamp.
# The `.dt` accessor requires the column to already have a datetime dtype.
purchase_ts = orders['order_purchase_timestamp']
orders['year'] = purchase_ts.dt.year
orders['month'] = purchase_ts.dt.month
orders['year_month'] = purchase_ts.dt.to_period('M')
orders['day_of_week'] = purchase_ts.dt.day_name()

print("\nüìà Distribui√ß√£o Temporal:")

print("\nPedidos por Ano:")
orders_per_year = orders['year'].value_counts().sort_index()
print(orders_per_year)

print("\nPedidos por M√™s (2017-2018):")
# Number of orders in each year-month period; only the last 12 periods are shown.
monthly_orders = orders.groupby('year_month').size()
print(monthly_orders.tail(12))


üìà Distribui√ß√£o Temporal:

Pedidos por Ano:
year
2016      329
2017    45101
2018    54011
Name: count, dtype: int64

Pedidos por M√™s (2017-2018):
year_month
2017-11    7544
2017-12    5673
2018-01    7269
2018-02    6728
2018-03    7211
2018-04    6939
2018-05    6873
2018-06    6167
2018-07    6292
2018-08    6512
2018-09      16
2018-10       4
Freq: M, dtype: int64
