In [11]:
import pandas as pd
import plotly.express as px
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as ss

# Datetime columns of each Olist CSV, keyed by the bare file name.
# Files that do not appear here are read without any date parsing:
#   olist_customers_dataset.csv, olist_geolocation_dataset.csv,
#   olist_order_payments_dataset.csv, olist_products_dataset.csv,
#   olist_sellers_dataset.csv, product_category_name_translation.csv
date_cols = {
    'olist_orders_dataset.csv': [
        'order_purchase_timestamp',
        'order_approved_at',
        'order_delivered_carrier_date',
        'order_delivered_customer_date',
        'order_estimated_delivery_date',
    ],
    'olist_order_items_dataset.csv': ['shipping_limit_date'],
    'olist_order_reviews_dataset.csv': [
        'review_creation_date',
        'review_answer_timestamp',
    ],
}


def read_olist_csv(path):
    """
    Load one Olist CSV, converting its known date columns to datetime.

    The date columns are looked up in ``date_cols`` by the file's base name,
    so the directory part of ``path`` does not matter.

    Args:
        path (str): Location of the CSV file.
    Returns:
        pd.DataFrame: The file's contents; columns registered in ``date_cols``
        for this file name are parsed as datetime, all others are left to
        pandas' default type inference.
    """
    csv_name = os.path.basename(path)
    timestamp_columns = date_cols[csv_name] if csv_name in date_cols else []
    return pd.read_csv(path, parse_dates=timestamp_columns)

# Original (raw) datasets, read with date columns parsed where applicable.
df_orders           = read_olist_csv('../data/original_data/olist_orders_dataset.csv')
df_customers        = read_olist_csv('../data/original_data/olist_customers_dataset.csv')
df_order_items      = read_olist_csv('../data/original_data/olist_order_items_dataset.csv')
df_order_payments   = read_olist_csv('../data/original_data/olist_order_payments_dataset.csv')
df_reviews          = read_olist_csv('../data/original_data/olist_order_reviews_dataset.csv')
df_products         = read_olist_csv('../data/original_data/olist_products_dataset.csv')
df_prod_cat_tr      = read_olist_csv('../data/original_data/product_category_name_translation.csv')
df_sellers          = read_olist_csv('../data/original_data/olist_sellers_dataset.csv')
df_geolocation      = read_olist_csv('../data/original_data/olist_geolocation_dataset.csv')

# Cleaned datasets.
# NOTE: cleaned_data/olist_products_dataset.csv is also written by this
# notebook's export cell, so it must already exist for this cell to run.
df_orders_delivered_clean       = read_olist_csv('../data/cleaned_data/olist_orders_dataset.csv')
df_geolocation_clean            = read_olist_csv('../data/cleaned_data/olist_geolocation_dataset.csv')
df_sellers_cleaned              = read_olist_csv('../data/cleaned_data/olist_sellers_dataset.csv')
df_products_cleaned             = read_olist_csv('../data/cleaned_data/olist_products_dataset.csv')
df_order_items_delivered_clean  = read_olist_csv('../data/cleaned_data/olist_order_items_dataset.csv')
df_order_payments_delivered_clean = read_olist_csv('../data/cleaned_data/olist_order_payments_dataset.csv')
# Fixed: this previously re-read the sellers file, so the "reviews" frame
# silently contained seller rows.
# NOTE(review): assumes the cleaned reviews file exists under this name - confirm.
df_reviews_delivered_clean       = read_olist_csv('../data/cleaned_data/olist_order_reviews_dataset.csv')

In [12]:
# Preview the first 10 rows of the raw products table (loaded in the first cell):
df_products.head(10)

Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40.0,287.0,1.0,225.0,16.0,10.0,14.0
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,44.0,276.0,1.0,1000.0,30.0,18.0,20.0
2,96bd76ec8810374ed1b65e291975717f,esporte_lazer,46.0,250.0,1.0,154.0,18.0,9.0,15.0
3,cef67bcfe19066a932b7673e239eb23d,bebes,27.0,261.0,1.0,371.0,26.0,4.0,26.0
4,9dc1a7de274444849c219cff195d0b71,utilidades_domesticas,37.0,402.0,4.0,625.0,20.0,17.0,13.0
5,41d3672d4792049fa1779bb35283ed13,instrumentos_musicais,60.0,745.0,1.0,200.0,38.0,5.0,11.0
6,732bd381ad09e530fe0a5f457d81becb,cool_stuff,56.0,1272.0,4.0,18350.0,70.0,24.0,44.0
7,2548af3e6e77a690cf3eb6368e9ab61e,moveis_decoracao,56.0,184.0,2.0,900.0,40.0,8.0,40.0
8,37cc742be07708b53a98702e77a21a02,eletrodomesticos,57.0,163.0,1.0,400.0,27.0,13.0,17.0
9,8c92109888e8cdf9d66dc7e463025574,brinquedos,36.0,1156.0,1.0,600.0,17.0,10.0,12.0


| Column Name                  | Description                                                                 |
|-----------------------------|-----------------------------------------------------------------------------|
| `product_id`                | Unique identifier for each product.                                         |
| `product_category_name`    | Name of the product category (in Portuguese).                 |
| `product_name_lenght`      | Number of characters in the product name.                                   |
| `product_description_lenght` | Number of characters in the product description.                           |
| `product_photos_qty`       | Number of photos available for the product.                                 |
| `product_weight_g`         | Weight of the product in grams.                                             |
| `product_length_cm`        | Length of the product's packaging in centimeters.                           |
| `product_height_cm`        | Height of the product's packaging in centimeters.                           |
| `product_width_cm`         | Width of the product's packaging in centimeters.                            |

In [13]:
# Column dtypes, non-null counts and memory usage of the products table:
df_products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32951 entries, 0 to 32950
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   product_id                  32951 non-null  object 
 1   product_category_name       32341 non-null  object 
 2   product_name_lenght         32341 non-null  float64
 3   product_description_lenght  32341 non-null  float64
 4   product_photos_qty          32341 non-null  float64
 5   product_weight_g            32949 non-null  float64
 6   product_length_cm           32949 non-null  float64
 7   product_height_cm           32949 non-null  float64
 8   product_width_cm            32949 non-null  float64
dtypes: float64(7), object(2)
memory usage: 2.3+ MB


In [14]:
# Summary statistics (count, mean, std, quartiles, min/max) per numeric column:
df_products.describe()

Unnamed: 0,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
count,32341.0,32341.0,32341.0,32949.0,32949.0,32949.0,32949.0
mean,48.476949,771.495285,2.188986,2276.472488,30.815078,16.937661,23.196728
std,10.245741,635.115225,1.736766,4282.038731,16.914458,13.637554,12.079047
min,5.0,4.0,1.0,0.0,7.0,2.0,6.0
25%,42.0,339.0,1.0,300.0,18.0,8.0,15.0
50%,51.0,595.0,1.0,700.0,25.0,13.0,20.0
75%,57.0,972.0,3.0,1900.0,38.0,21.0,30.0
max,76.0,3992.0,20.0,40425.0,105.0,105.0,118.0


In [15]:
# One record per column: its name, the number of distinct non-null values,
# and the array of those values.
summary = [
    {
        'Column': name,
        'Unique Count': len(distinct),
        'Unique Values': distinct,
    }
    for name, distinct in (
        (name, df_products[name].dropna().unique())
        for name in df_products.columns
    )
]

df_summary = pd.DataFrame(summary)
df_summary

Unnamed: 0,Column,Unique Count,Unique Values
0,product_id,32951,"[1e9e8ef04dbcff4541ed26657ea517e5, 3aa071139cb..."
1,product_category_name,73,"[perfumaria, artes, esporte_lazer, bebes, util..."
2,product_name_lenght,66,"[40.0, 44.0, 46.0, 27.0, 37.0, 60.0, 56.0, 57...."
3,product_description_lenght,2960,"[287.0, 276.0, 250.0, 261.0, 402.0, 745.0, 127..."
4,product_photos_qty,19,"[1.0, 4.0, 2.0, 3.0, 5.0, 9.0, 6.0, 7.0, 12.0,..."
5,product_weight_g,2204,"[225.0, 1000.0, 154.0, 371.0, 625.0, 200.0, 18..."
6,product_length_cm,99,"[16.0, 30.0, 18.0, 26.0, 20.0, 38.0, 70.0, 40...."
7,product_height_cm,102,"[10.0, 18.0, 9.0, 4.0, 17.0, 5.0, 24.0, 8.0, 1..."
8,product_width_cm,95,"[14.0, 20.0, 15.0, 26.0, 13.0, 11.0, 44.0, 40...."


In [16]:
# Number of rows that exactly duplicate an earlier row:
df_products.duplicated().sum()

0

In [17]:
# Whether every product_id value occurs exactly once:
df_products['product_id'].is_unique

True

In [18]:
# Missing-value count per column:
df_products.isna().sum()

product_id                      0
product_category_name         610
product_name_lenght           610
product_description_lenght    610
product_photos_qty            610
product_weight_g                2
product_length_cm               2
product_height_cm               2
product_width_cm                2
dtype: int64

## Data Cleaning

### Products with missing metadata (category, name length, description length, photo count)

In [19]:
# The four descriptive columns that each report 610 missing values:
cols_610 = [
    'product_category_name', 'product_name_lenght',
    'product_description_lenght', 'product_photos_qty',
]

# True for every row in which all four of those columns are null:
missing_mask = df_products[cols_610].isna().all(axis='columns')

# How many rows are flagged:
missing_rows_count = missing_mask.sum()
print(f"Number of rows where all 4 columns are missing: {missing_rows_count}")

# Display the flagged rows:
df_missing_rows = df_products[missing_mask]
df_missing_rows

Number of rows where all 4 columns are missing: 610


Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
105,a41e356c76fab66334f36de622ecbd3a,,,,,650.0,17.0,14.0,12.0
128,d8dee61c2034d6d075997acef1870e9b,,,,,300.0,16.0,7.0,20.0
145,56139431d72cd51f19eb9f7dae4d1617,,,,,200.0,20.0,20.0,20.0
154,46b48281eb6d663ced748f324108c733,,,,,18500.0,41.0,30.0,41.0
197,5fb61f482620cb672f5e586bb132eae9,,,,,300.0,35.0,7.0,12.0
...,...,...,...,...,...,...,...,...,...
32515,b0a0c5dd78e644373b199380612c350a,,,,,1800.0,30.0,20.0,70.0
32589,10dbe0fbaa2c505123c17fdc34a63c56,,,,,800.0,30.0,10.0,23.0
32616,bd2ada37b58ae94cc838b9c0569fecd8,,,,,200.0,21.0,8.0,16.0
32772,fa51e914046aab32764c41356b9d4ea4,,,,,1300.0,45.0,16.0,45.0


First, let's check if these products have ever been sold before (included in delivered orders):

In [20]:
# Step 1: flag products whose four descriptive fields are all null and
# collect their product ids.
missing_mask = df_products[
    ['product_category_name', 'product_name_lenght',
     'product_description_lenght', 'product_photos_qty']
].isna().all(axis='columns')
blank_ids = df_products.loc[missing_mask, 'product_id']

# Step 2: keep only the order items that belong to an order present in the
# cleaned (delivered) orders table.
delivered_set = set(df_orders_delivered_clean['order_id'])
delivered_items = df_order_items.loc[df_order_items['order_id'].isin(delivered_set)]

# Step 3: of those items, keep the ones whose product is a flagged SKU.
sold_blank = delivered_items.loc[delivered_items['product_id'].isin(blank_ids)]

# Step 4: number of distinct delivered orders per flagged SKU, most-ordered first.
sku_freq = (
    sold_blank
    .groupby('product_id')['order_id']
    .nunique()
    .rename('delivered_order_count')
    .reset_index()
    .sort_values(by='delivered_order_count', ascending=False)
)

print(sku_freq)

                           product_id  delivered_order_count
204  5a848e4ab52fd5445cdc07aab1c40e48                    187
385  b1d207586fca400a2370d50a9ba1da98                     42
260  76d1a1a9d21ab677a61c3ae34b1b352f                     32
373  ad88641611c35ebd59ecda07a9f17099                     28
133  3b60d513e90300a4e9833e5cda1f1d61                     27
..                                ...                    ...
221  62fde58e97724f8b7519f3789eefa33f                      1
220  62ea5d617ff74ab3e446aa418e7f67ed                      1
217  61e9a389f1bef5181fc96e33f0dc16a6                      1
216  61a15da36ca0448be0565f8ee41dd37e                      1
582  fff28f91211774864a1000f918ed00cc                      1

[583 rows x 2 columns]


Although 610 products have blank metadata (category, name length, description length, photo count), 583 of them have actually been sold and delivered to a customer, with one of them appearing in 187 distinct delivered orders. These products are valuable for our delivery analysis, so we won't remove them.

However, the remaining 27 products never appear in a delivered order, so they can be safely removed: they add no information to the analysis.

In [21]:
# Every product id flagged as having blank metadata
blank_ids_set = set(blank_ids)

# Flagged ids that appear in at least one delivered order
sold_blank_ids = set(sku_freq['product_id'])

# Flagged ids that never appear in a delivered order: these get dropped
drop_ids = blank_ids_set.difference(sold_blank_ids)

drop_ids

{'017457b0013d01d5a5a4a020ed1f14b9',
 '05cf9ac595f28386ee763c98cbc2bad0',
 '0c40401eba358c9ef2ff2df80b6eab52',
 '29fffacd0907d0944b7af35086efdb43',
 '2c24793fa7743021c907ec930fce14b9',
 '32a99bb84ae79a55c0e446afc9310e1e',
 '3d103df80d5d22d709244fe27bc29a8d',
 '3f6f946481fd39f4eda986012f6e0447',
 '44483ca1c9223756ee0ee71c0676d3a3',
 '4ba237d0c64f22e4d22ec19af9cfd1f4',
 '6868e3f08c6acd37b48c785aedddfdb7',
 '794de06c32a626a5692ff50e4985d36f',
 '7f776a76a6dce545729c997e60240ffa',
 '96aca2f53bcaed6f466449f7fb18ae75',
 '99c3deeef8923c1036a30a5abed9071c',
 '9f6e8441548dd40516cdb98f1b29c3d5',
 'a08ffa6ecdd0ab3c1f4fe348acc2553b',
 'a8e59319e3c44b5af3a5412d713af5bb',
 'b52385de7f52a8dc46d4195e2f77f965',
 'b5d652cabeb01249ad1cf910eef3230b',
 'c647d965c3bd45fa151a61dc4233b7cc',
 'c73ce46ad401dcf7c05da1cc8a65dea2',
 'cdd63834cc3da0f9225cc20f494dc137',
 'ceda8168fb181a91487d6e924f42334f',
 'e3c816666a7d2a1e7fbf02651e550b78',
 'e987172c2818cc16b2555bc130ac5fac',
 'f52a0f70e54976873a4a7402349c6105'}

In [22]:
# Keep every product whose id is NOT in drop_ids; .copy() gives an independent
# frame so the later .loc assignments modify it directly.
df_products_clean = df_products[~df_products['product_id'].isin(drop_ids)].copy()

For the remaining 583 products, we can simply fill the missing values with placeholders that indicate missingness. Only `product_category_name` could be useful in the delivery analysis, so this column is filled with the value `'unknown_category'`. The other three columns (`product_name_lenght`, `product_description_lenght`, `product_photos_qty`) are filled with 0, as they do not impact our analysis.

In [23]:
# Numeric descriptive columns that receive the 0 placeholder
num_cols = ['product_name_lenght', 'product_description_lenght', 'product_photos_qty']

# Rows (the 583 SKUs) where the category and all three numeric fields are null
missing_mask = (
    df_products_clean[['product_category_name', *num_cols]]
    .isna()
    .all(axis='columns')
)

# Placeholder category for those rows, then zeros for the numeric fields
df_products_clean.loc[missing_mask, 'product_category_name'] = 'unknown_category'
df_products_clean.loc[missing_mask, num_cols] = 0

In [24]:
# Double check: re-count missing values per column after the placeholder fill.
df_products_clean.isna().sum()

product_id                    0
product_category_name         0
product_name_lenght           0
product_description_lenght    0
product_photos_qty            0
product_weight_g              2
product_length_cm             2
product_height_cm             2
product_width_cm              2
dtype: int64

### Missing dimensions & weight

In [25]:
# Rows of the cleaned table that still contain at least one null
missing_rows = df_products_clean.loc[df_products_clean.isna().any(axis='columns')]

display(missing_rows)

Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
8578,09ff539a621711667c43eba6a3bd8466,bebes,60.0,865.0,3.0,,,,
18851,5eb564652db742ff8f28759cd8d2652a,unknown_category,0.0,0.0,0.0,,,,


Two products still have missing values (weight and dimensions). Let's check how often they were sold:

In [26]:
# Step 1: flag products missing any of weight / length / height / width and
# collect their product ids.
missing_mask = df_products_clean[
    ['product_weight_g', 'product_length_cm',
     'product_height_cm', 'product_width_cm']
].isna().any(axis='columns')
blank_ids = df_products_clean.loc[missing_mask, 'product_id']

# Step 2: keep only the order items that belong to an order present in the
# cleaned (delivered) orders table.
delivered_set = set(df_orders_delivered_clean['order_id'])
delivered_items = df_order_items.loc[df_order_items['order_id'].isin(delivered_set)]

# Step 3: of those items, keep the ones whose product is a flagged SKU.
sold_blank = delivered_items.loc[delivered_items['product_id'].isin(blank_ids)]

# Step 4: number of distinct delivered orders per flagged SKU, most-ordered first.
sku_freq = (
    sold_blank
    .groupby('product_id')['order_id']
    .nunique()
    .rename('delivered_order_count')
    .reset_index()
    .sort_values(by='delivered_order_count', ascending=False)
)

print(sku_freq)

                         product_id  delivered_order_count
1  5eb564652db742ff8f28759cd8d2652a                     15
0  09ff539a621711667c43eba6a3bd8466                      1


Even though product id '5eb564652db742ff8f28759cd8d2652a' has 15 delivered orders, it is missing EVERY feature (category, descriptive fields, weight and dimensions), so there is nothing to impute from. This product can't be used for analysis and has to be removed.

The other product, however, has a category of 'bebes', let's check if there's enough information about that category to impute this product

In [27]:
# Summary statistics of the numeric columns for products in the 'bebes' category:
df_products_clean[df_products_clean['product_category_name'] == 'bebes'].describe()

Unnamed: 0,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
count,919.0,919.0,919.0,918.0,918.0,918.0,918.0
mean,46.056583,824.261153,2.352557,3655.201525,37.147059,21.617647,28.717865
std,11.382298,544.41453,1.735121,5665.129577,17.94573,16.591138,13.964589
min,7.0,4.0,1.0,50.0,14.0,2.0,8.0
25%,38.0,381.0,1.0,400.0,23.0,10.0,16.0
50%,49.0,728.0,2.0,850.0,33.0,16.0,25.0
75%,55.5,1133.0,3.0,5106.25,45.0,30.0,40.0
max,64.0,3923.0,19.0,30000.0,102.0,97.0,84.0


There are 919 'bebes' products in the dataset, 918 of which have complete weight and dimensions. The category medians can be used to impute the missing values, so that the product is still useful for analysis.

In [28]:
drop_id = '5eb564652db742ff8f28759cd8d2652a'     # no category and no dimensions: nothing to impute from
impute_id = '09ff539a621711667c43eba6a3bd8466'   # category 'bebes' is known; only weight/dimensions missing

dims = ['product_weight_g', 'product_length_cm', 'product_height_cm', 'product_width_cm']

# 1. Drop the SKU that is missing everything.
# .copy() makes the filtered result an independent frame, so the .loc
# assignment below cannot raise SettingWithCopyWarning.
df_products_clean = df_products_clean[df_products_clean['product_id'] != drop_id].copy()

# 2. Impute weight and dimensions for the remaining SKU with the medians of
# its category. The medians come from the frame being cleaned, not the raw
# `df_products`, which is rebound later in the notebook. median() skips NaN,
# so the row being imputed does not affect the result.
bebes_medians = (
    df_products_clean
    .loc[df_products_clean['product_category_name'] == 'bebes', dims]
    .median()
)

# Fill the missing values; .values assigns by position, in the order of `dims`.
df_products_clean.loc[df_products_clean['product_id'] == impute_id, dims] = bebes_medians.values

In [29]:
# Double check: re-count missing values per column after the drop and the median imputation.
df_products_clean.isna().sum()

product_id                    0
product_category_name         0
product_name_lenght           0
product_description_lenght    0
product_photos_qty            0
product_weight_g              0
product_length_cm             0
product_height_cm             0
product_width_cm              0
dtype: int64

In [30]:
# Write the cleaned products table to the cleaned-data folder without the index column.
# NOTE: the first cell reads this same path into df_products_cleaned.
df_products_clean.to_csv('../data/cleaned_data/olist_products_dataset.csv', index=False)

## Data Analysis

In [31]:
# Use a copy of the cleaned table as `df_products` for the analysis below.
# NOTE(review): this replaces the raw frame loaded in the first cell, so any
# earlier cell re-run after this point operates on the cleaned data instead.
df_products = df_products_clean.copy()

In [32]:
# Numeric product features to correlate.
# NOTE: the name/description/photo columns contain 0 placeholders for the
# products whose metadata was blank (filled in the cleaning section), which
# influences the correlations involving those columns.
numeric_cols = [
    'product_name_lenght',
    'product_description_lenght',
    'product_photos_qty',
    'product_weight_g',
    'product_length_cm',
    'product_height_cm',
    'product_width_cm'
]

# Pairwise correlation matrix (pandas default method):
corr_matrix = df_products[numeric_cols].corr()

# Heatmap of the matrix. Correlations lie in [-1, 1]; pinning the colour range
# to that interval centres the diverging RdBu scale on 0, so white means "no
# correlation" and equal colour intensity means equal strength. Without it the
# scale stretches over the observed min/max and the midpoint is not zero.
fig = px.imshow(
    corr_matrix,
    text_auto=True,
    color_continuous_scale='RdBu',
    zmin=-1,
    zmax=1,
    title='Correlation Heatmap of Product Features'
)

fig.update_layout(
    autosize=True,
    width=700,
    height=600,
    margin=dict(l=40, r=40, t=60, b=40),
    xaxis_title=None,
    yaxis_title=None
)

fig.show()

## Univariate Analysis:

In [33]:
# Distribution of product_photos_qty (at most 10 bins requested).
# NOTE: products whose metadata was blank carry the placeholder value 0 here.
fig = px.histogram(df_products, x='product_photos_qty', nbins=10, title='Photo Quantity Distribution')
fig.show()

In [34]:
# The ten categories with the most products, as a two-column frame
# (category name, number of products):
top_cats = (
    df_products['product_category_name']
    .value_counts()
    .nlargest(10)
    .rename_axis('product_category_name')
    .reset_index(name='count')
)

# Bar chart of those counts:
fig = px.bar(top_cats, x='product_category_name', y='count', title='Top 10 Product Categories')
fig.update_layout(xaxis_title='Category', yaxis_title='Count')
fig.show()

## Bivariate Analysis

In [35]:
# Restrict to the ten categories with the most products, then compare their
# weight distributions with one box per category.
top_categories = df_products['product_category_name'].value_counts().nlargest(10).index
filtered_df = df_products.loc[df_products['product_category_name'].isin(top_categories)]

fig = px.box(
    filtered_df,
    x='product_category_name',
    y='product_weight_g',
    title='Product Weight by Category',
)
fig.show()

In [36]:
# Scatter plot of name length vs description length, one point per product.
# NOTE: products whose metadata was blank sit at (0, 0) because both lengths
# were filled with the placeholder 0 during cleaning.
fig = px.scatter(df_products, x='product_name_lenght', y='product_description_lenght',
                 title='Name vs Description Length')
fig.show()