In [32]:
import pandas as pd
import plotly.express as px
import os

In [33]:
# Columns to parse as dates, keyed by CSV file name (looked up in read_olist_csv):
date_cols = {
    "olist_order_items_dataset.csv": ["shipping_limit_date"],
}

In [34]:
def read_olist_csv(path):
    """
    Load one Olist CSV file, parsing the date columns registered for it.

    The file's base name is looked up in the module-level ``date_cols``
    mapping; a file with no entry there is read with an empty
    ``parse_dates`` list.

    Args:
        path (str): Location of the CSV file.
    Returns:
        pd.DataFrame: The file contents as returned by ``pd.read_csv``, with
        the registered date columns passed as ``parse_dates``.
    """
    # Only the trailing path component identifies the dataset.
    _, filename = os.path.split(path)
    columns_to_parse = date_cols[filename] if filename in date_cols else []
    frame = pd.read_csv(path, parse_dates=columns_to_parse)
    return frame

In [None]:
# Read the order items CSV through read_olist_csv and preview its first ten rows:
df_order_items = read_olist_csv("archive/olist_order_items_dataset.csv")
df_order_items.head(n=10)

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.9,19.93
2,000229ec398224ef6ca0657da4fc703e,1,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,2018-01-18 14:48:30,199.0,17.87
3,00024acbcdf0a6daa1e931b038114c75,1,7634da152a4610f1595efa32f14722fc,9d7a1d34a5052409006425275ba1c2b4,2018-08-15 10:10:18,12.99,12.79
4,00042b26cf59d7ce69dfabb4e55b4fd9,1,ac6c3623068f30de03045865e4e10089,df560393f3a51e74553ab94004ba5c87,2017-02-13 13:57:51,199.9,18.14
5,00048cc3ae777c65dbb7d2a0634bc1ea,1,ef92defde845ab8450f9d70c526ef70f,6426d21aca402a131fc0a5d0960a3c90,2017-05-23 03:55:27,21.9,12.69
6,00054e8431b9d7675808bcb819fb4a32,1,8d4f2bb7e93e6710a28f34fa83ee7d28,7040e82f899a04d1b434b795a43b4617,2017-12-14 12:10:31,19.9,11.85
7,000576fe39319847cbb9d288c5617fa6,1,557d850972a7d6f792fd18ae1400d9b6,5996cddab893a4652a15592fb58ab8db,2018-07-10 12:30:45,810.0,70.75
8,0005a1a1728c9d785b8e2b08b904576c,1,310ae3c140ff94b03219ad0adc3c778f,a416b6a846a11724393025641d4edd5e,2018-03-26 18:31:29,145.95,11.65
9,0005f50442cb953dcd1d21e1fb923495,1,4535b0e1091c278dfd193e5a1d63b39f,ba143b05f0110f0dc71ad71b4466ce92,2018-07-06 14:10:56,53.99,11.4


| Column Name           | Description                                                                 |
|-----------------------|-----------------------------------------------------------------------------|
| `order_id`            | Unique identifier for the customer order. Multiple items can share an order ID. |
| `order_item_id`       | Sequential number of the item in the order (e.g., 1st item, 2nd item, etc.). |
| `product_id`          | Unique identifier for the specific product being purchased.                 |
| `seller_id`           | Unique identifier for the seller offering the product.                      |
| `shipping_limit_date` | Deadline by which the seller must ship the product to meet the delivery SLA. |
| `price`               | Price paid by the customer for the item (in BRL, excluding freight).         |
| `freight_value`       | Shipping cost for this item (in BRL).                |

In [36]:
# Column dtypes, non-null counts and memory usage of the order items frame:
df_order_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 7 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   order_id             112650 non-null  object        
 1   order_item_id        112650 non-null  int64         
 2   product_id           112650 non-null  object        
 3   seller_id            112650 non-null  object        
 4   shipping_limit_date  112650 non-null  datetime64[ns]
 5   price                112650 non-null  float64       
 6   freight_value        112650 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(1), object(3)
memory usage: 6.0+ MB


In [37]:
# Summary statistics (count, mean, quartiles, min/max, std) for the described columns:
df_order_items.describe()

Unnamed: 0,order_item_id,shipping_limit_date,price,freight_value
count,112650.0,112650,112650.0,112650.0
mean,1.197834,2018-01-07 15:36:52.192685312,120.653739,19.99032
min,1.0,2016-09-19 00:15:34,0.85,0.0
25%,1.0,2017-09-20 20:57:27.500000,39.9,13.08
50%,1.0,2018-01-26 13:59:35,74.99,16.26
75%,1.0,2018-05-10 14:34:00.750000128,134.9,21.15
max,21.0,2020-04-09 22:35:08,6735.0,409.68
std,0.705124,,183.633928,15.806405


In [38]:
# One record per column: its name, the number of distinct non-null values,
# and the array of those distinct values.
summary = [
    {
        'Column': name,
        'Unique Count': len(distinct),
        'Unique Values': distinct,
    }
    for name, distinct in (
        (column, df_order_items[column].dropna().unique())
        for column in df_order_items.columns
    )
]

df_summary = pd.DataFrame(summary)
df_summary

Unnamed: 0,Column,Unique Count,Unique Values
0,order_id,98666,"[00010242fe8c5a6d1ba2dd792cb16214, 00018f77f2f..."
1,order_item_id,21,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
2,product_id,32951,"[4244733e06e7ecb4970a6e2683c13e61, e5f2d52b802..."
3,seller_id,3095,"[48436dade18ac8b2bce089ec2a041202, dd7ddc04e1b..."
4,shipping_limit_date,93318,"[2017-09-19 09:45:35, 2017-05-03 11:05:13, 201..."
5,price,5968,"[58.9, 239.9, 199.0, 12.99, 199.9, 21.9, 19.9,..."
6,freight_value,6999,"[13.29, 19.93, 17.87, 12.79, 18.14, 12.69, 11...."


In [39]:
# Count of missing values per column:
df_order_items.isna().sum()

order_id               0
order_item_id          0
product_id             0
seller_id              0
shipping_limit_date    0
price                  0
freight_value          0
dtype: int64

In [40]:
# Number of rows that are exact duplicates of an earlier row:
df_order_items.duplicated().sum()

0

In [41]:
# Share of orders whose summed item price falls below vs. at/above a threshold.
# The threshold is named once so the comparison and both labels stay in sync.
ORDER_PRICE_THRESHOLD = 79  # in R$, compared against the per-order sum of `price`
below_label = f'Below R${ORDER_PRICE_THRESHOLD}'
above_label = f'R${ORDER_PRICE_THRESHOLD} and Above'

# Group by order_id to get total order price, then label each order's bucket.
# `.lt(...).map(...)` is the vectorised equivalent of a row-wise apply/lambda.
df_order_price = (
    df_order_items
    .groupby('order_id', as_index=False)['price']
    .sum()
    .assign(
        price_category=lambda orders: orders['price']
        .lt(ORDER_PRICE_THRESHOLD)
        .map({True: below_label, False: above_label})
    )
)

# Percentage of orders per bucket; both output columns are named explicitly
# rather than relying on the default names produced by value_counts().
order_price_summary = (
    df_order_price['price_category']
    .value_counts(normalize=True)
    .mul(100)
    .rename_axis('Price Category')
    .reset_index(name='Percentage')
)

fig = px.pie(
    order_price_summary,
    names='Price Category',
    values='Percentage',
    title='Percentage of Orders by Total Order Price',
    color_discrete_sequence=['#1f77b4', '#0d3d73']
)
fig.show()