In [None]:
%%javascript
// Inject the notebook-wide CSS for headings, the "Back" link and the auto-inserted rules.
// A fixed element id lets a re-run of this cell reuse the existing <style> element
// instead of stacking one more duplicate stylesheet in <head> on every execution.
// NOTE(review): inside `h1.converted-h3` the later `padding: 0` shorthand overrides the
// earlier `padding-bottom: 6px`, so that bottom padding never takes effect.
const style = document.getElementById('auto-heading-styles') ?? document.createElement('style');
style.id = 'auto-heading-styles';
// The sheet is plain CSS text, so it is assigned as text rather than parsed as HTML.
style.textContent = `
    /* Style for h1 */
    h1 {
        color: white !important;
        font-size: 24px !important;
        font-weight: 600 !important;
        background: linear-gradient(135deg, #185a9d 0%, #43cea2 100%) !important;
        padding: 15px 0 15px 20px !important;
        margin: 0 -15px 10px 0 !important;
        border-radius: 0 12px 12px 0 !important;
    }
    
    /* Style for h2 */
    h2 {
        color: #185a9d !important;
        font-size: 22px !important;
        font-weight: 600 !important;
        margin: 0 0 15px -20px !important;
        padding-bottom: 8px !important;
        border-bottom: 2px solid #185a9d !important;
    }
    
    /* Style for h3 */
    h3 {
        color: #2d7f9d !important;
        font-size: 20px !important;
        font-weight: 600 !important;
        margin: 0 0 0 -43px !important;
        padding-bottom: 6px !important;
    }
    
    /* Style for h4 */
    h4 {
        color: #127852 !important;
        font-size: 18px !important;
        font-weight: 600 !important;
        margin: 0 0 0 -60px !important;
    }
    
    /* Indentation for headings */
    h2::before { content: "\\00a0\\00a0\\00a0\\00a0\\00a0"; }
    h3::before { content: "\\00a0\\00a0\\00a0\\00a0\\00a0\\00a0\\00a0\\00a0\\00a0\\00a0"; }
    h4::before { content: "\\00a0\\00a0\\00a0\\00a0\\00a0\\00a0\\00a0\\00a0\\00a0\\00a0\\00a0\\00a0\\00a0\\00a0\\00a0"; }
    
    /* Style for "Back" button */
    .auto-back-link {
        color: white !important;
        text-decoration: none !important;
        background: linear-gradient(135deg, #185a9d 0%, #43cea2 100%) !important;
        padding: 6px 15px !important;
        border-radius: 4px !important;
        font-weight: 600 !important;
        font-size: 13px !important;
        display: inline-block !important;
        margin-left: 15px !important;
        transition: all 0.3s !important;
        border: 1px solid rgba(255,255,255,0.2) !important;
    }
    
    .auto-back-link:hover {
        opacity: 0.8 !important;
    }
    
    /* Style for horizontal lines */
    .auto-hr-h3 {
        border: none !important;
        height: 2px !important;
        background: #2d7f9d !important;
        margin: 5px 0 15px -20px !important;
    }
    
    .auto-hr-h4 {
        border: none !important;
        height: 2px !important;
        background: #127852 !important;
        margin: 5px 0 15px -20px !important;
    }
    /*Style for converted H3 (now h1) */
    h1.converted-h3 {
        color: #2d7f9d !important;
        font-size: 20px !important;
        font-weight: 600 !important;
        margin: 0 0 0 -43px !important;
        padding-bottom: 6px !important;
        background: none !important; /*Remove the gradient */
        border-radius: 0 !important;
        padding: 0 !important;
    }  
`;
// appendChild moves a node that is already attached, so the sheet always ends up last in <head>.
document.head.appendChild(style);

// Function to process all headers
function processHeaders() {  
    // Process h2 headers
    document.querySelectorAll('h2').forEach(h2 => {
        // Always add back button to h2
        if (!h2.nextElementSibling?.classList?.contains('auto-back-link')) {
            const backLink = document.createElement('a');
            backLink.href = '#toc';
            backLink.className = 'auto-back-link';
            backLink.textContent = '↑ Back to Contents';
            backLink.onmouseover = () => backLink.style.opacity = '0.8';
            backLink.onmouseout = () => backLink.style.opacity = '1';
            h2.insertAdjacentElement('afterend', backLink);
        }
    });
    
    // Process h3 headers
    document.querySelectorAll('h3').forEach(h3 => {
        // Add horizontal line
        if (!h3.nextElementSibling?.classList?.contains('auto-hr-h3')) {
            const hr = document.createElement('hr');
            hr.className = 'auto-hr-h3';
            h3.insertAdjacentElement('afterend', hr);
        }
    });
    
    // Process h4 headers
    document.querySelectorAll('h4').forEach(h4 => {
        // Add horizontal line
        if (!h4.nextElementSibling?.classList?.contains('auto-hr-h4')) {
            const hr = document.createElement('hr');
            hr.className = 'auto-hr-h4';
            h4.insertAdjacentElement('afterend', hr);
        }
    });

    // Convert h3 to h1 for kaggle table of contents on the right after all other processing is done
    convertH3ToH1();
}

/**
 * Replace every h3 in the document with an h1 carrying the `converted-h3` class.
 *
 * The new h1 receives the h3's markup prefixed with non-breaking spaces, all of
 * the h3's attributes other than `class`, and all of its classes other than
 * `converted-h3`. If the element right after the new h1 is an `auto-hr-h3`
 * rule, it is re-inserted directly behind the h1.
 */
function convertH3ToH1() {
    for (const oldHeading of document.querySelectorAll('h3')) {
        const replacement = document.createElement('h1');

        // Indentation is baked into the heading text itself as leading &nbsp; entities.
        replacement.innerHTML = '&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;' + oldHeading.innerHTML;
        replacement.className = 'converted-h3';

        // Carry over every attribute except `class`, which is rebuilt below.
        const carriedAttributes = Array.from(oldHeading.attributes)
            .filter(({ name }) => name !== 'class');
        for (const { name, value } of carriedAttributes) {
            replacement.setAttribute(name, value);
        }

        // Carry over the classes; `converted-h3` is already set on the new node.
        for (const cls of oldHeading.classList) {
            if (cls === 'converted-h3') {
                continue;
            }
            replacement.classList.add(cls);
        }

        // Put the h1 where the h3 was.
        oldHeading.parentNode.replaceChild(replacement, oldHeading);

        // Keep the h3 rule, if present, directly behind the new heading.
        const following = replacement.nextElementSibling;
        if (following?.classList.contains('auto-hr-h3')) {
            replacement.parentNode.insertBefore(following, replacement.nextSibling);
        }
    }
}

// First pass: decorate the headings that are already in the page.
processHeaders();

// Re-run the pass whenever nodes are added or removed anywhere under <body>,
// so headings that appear later are decorated too.
const observer = new MutationObserver(() => processHeaders());
observer.observe(document.body, { subtree: true, childList: true });

// Fallback: one more pass a second after this cell ran.
setTimeout(() => processHeaders(), 1000);

// Delete this cell.
// Note that the match is by content: every `.code_cell` element whose text
// contains "%%javascript" is removed from the DOM, not only the current one.
(() => {
    for (const cell of document.querySelectorAll('.code_cell')) {
        // innerText first, textContent as the fallback when it is empty/unavailable.
        const cellText = cell.innerText || cell.textContent;

        // Leave cells without the "%%javascript" magic command untouched.
        if (!cellText.includes('%%javascript')) {
            continue;
        }

        cell.remove();
    }
})();

<span style="display: block;
           color: white;
           font-size: 28px;
           font-weight: 800;
           background: linear-gradient(135deg, #185a9d 0%, #43cea2 100%);
           padding: 20px;
           box-shadow: 0 4px 12px rgba(0,0,0,0.15);
           letter-spacing: 1px;">
🌊 Deep Sales Analysis of Olist Marketplace
</span>

**Author:**  

Pavel Grigoryev

**Project Description:**   

Olist is a Brazilian e-commerce platform that connects sellers and buyers, offering a wide range of products and convenient conditions for online sales. Olist also acts as an intermediary, allowing sellers to connect to multiple marketplaces simultaneously, thereby increasing their reach.

This project conducts an in-depth analysis of sales data on the Olist platform. The analysis focuses on identifying key trends, patterns in customer behavior, and operational insights. The findings will help formulate recommendations to improve business processes and enhance sales efficiency.

**Goal:**  

The goal of the analysis is to identify key trends, patterns, and insights in sales data, customer behavior, and the effectiveness of marketing strategies on the Olist platform. The results will help develop recommendations to improve business strategies and enhance customer experience.

**Project Resources:**  

- [**Presentation Slides**](https://docs.google.com/presentation/d/1sOYi3MWXedIEnuSn41H8lBeZ9aGnnTi5iV-DEMbfCvc/present)
- [**Web Report**](https://pavelgrigoryevds.github.io/olist-deep-dive/)
- [**GitHub Repo**](https://github.com/PavelGrigoryevDS/olist-deep-dive)
  
**Data Sources:**  

Sales data from the Olist marketplace.

**Analysis Timeframe:**  

Data from September 2016 to October 2018 will be used for the analysis.

**Main Conclusions:**  

- Sales Trends:
  - **Growth & Stabilization**: Sales volume and revenue grew until 2018, then stabilized at 6–7K orders and 1–1.2M R$ per month.
  - **Black Friday (11/24/2017)**: Record spikes in orders, revenue, and buyers.
  - **Geography**: São Paulo dominates (42% of sales), with steady growth in 2018, unlike other regions.

- Customer Behavior:
  - **Low Retention**: 97% of buyers made only one purchase; repeat buyers are rare.
  - **High-Value Buyers**: Clients using installment plans (50%) spend 2x more (higher average order value/weight).
  - **Loyalty**: Promoters (58% of buyers) leave positive reviews but rarely return. Critics (13%) spend more but churn faster due to delivery delays.

- Operational Insights:
  - Delayed orders correlate with lower ratings (avg. rating: 1–2 vs. 4–5 for on-time).
  - Heavy/expensive orders take longer to deliver and are more likely to be delayed.
  - Orders with installments process faster, have higher AOV, and show better retention.

- Payment & Risk:
  - **Credit cards dominate**: 74% of transactions, with 35% higher AOV vs. other methods.
  - **Installments boost value**: Orders with installments have 2x higher AOV (premium/heavy items).
  - **Voucher Payments**: Orders paid with vouchers have 3x higher cancellation rates (16% vs. 5% for credit cards).

- Product & Logistics:
  - **Top Categories**: Electronics (27% of sales) and furniture (18%) drive revenue.
  - Deliveries to northern states take 2x longer.
  - Heavy orders (+40% delivery time) and premium items face delays.
  - **Delivery Bottlenecks**: 70% of total delivery time is spent with carriers, notably slower in Rio de Janeiro and Salvador.

- Critical Challenges:
  - **Declining Ratings**: Average review scores dropped from 4.5 (2017) to 3.9 (2018), linked to delivery delays.
  - **Peak Season Failures**: Black Friday 2017 caused a surge in delayed deliveries, with complaints tied to carrier handoff bottlenecks.
  - **Abandoned Carts**: Canceled orders spike in February/August 2018, often for high-value items paid via vouchers.

- Customer Feedback & Ratings:
  - **Majority of reviews are positive**: 58% of reviews received a rating of 5. Only 12% of reviews received a rating of 1, while a mere 3% received a rating of 2.
  - **Negative Reviews**: 15% of review text mentions "slow delivery" or "missing items," heavily impacting NPS.

- Data Highlights:
  - **Negative Feedback Drivers**: Low ratings correlate with longer delivery times, higher order value, and heavier items.
  - **Success Factors**: Fast carrier handoff (≤3 days) and installment options boost ratings and repeat purchases.

**Key Recommendations:**

- Boost Customer Retention & Repeat Purchases:
  - Launch a loyalty program targeting one-time buyers (97% of customers), offering discounts on second purchases or bonus points.
  - Personalized win-back campaigns for high-value clients (top 1% driving 15% of revenue) with exclusive offers.
  - Reduce time between purchases (currently 29+ days for 50% of repeat buyers) via time-bound promotions (e.g., "7-day discount").
  
- Improve Product & Pricing Strategy:
  - Expand "Beauty & Health" and "Home & Garden" (18% YoY growth categories) with curated bundles or subscriptions.
  - Reprice problem categories (e.g., "Watches & Gifts") to offset delivery costs or offer guaranteed faster shipping.

- Enhance High-Value Segments:
  - Premium installment plans for big spenders (avg. 3+ orders) with perks like free shipping or priority support.
  - Target voucher users (3x higher cancellation risk) with limited-time combo deals to convert abandoned carts.

- Fix Delivery Pain Points:
  - Prioritize carrier performance in critical regions (e.g., Rio de Janeiro, Salvador), where delays are 30% longer than average.
  - Expedite high-value/heavy orders (>500 R$ or >10kg), which face 2x more 1-star ratings due to delays.
  - Optimize Black Friday logistics to prevent a repeat of 2017’s 4x surge in delays (pre-stock inventory, add temporary carriers).

- Regional Growth Tactics:
  - Hyper-local campaigns in São Paulo (42% of sales): Leverage its 20% faster delivery and 30% higher retention to test scalable models.
  - Fix underperformers (e.g., Maranhão, Ceará) with subsidized shipping or partner pickup points.

- Mitigate Negative Reviews:
    - Automate compensation for delayed orders (e.g., 10% off next purchase if delivery exceeds 15 days).
    - Sunday support surge: Add staff to cut response times, reducing low weekend ratings.

<a id="toc"></a>
<div style="color: white;
           font-size: 20px;
           font-weight: 700;
           background: linear-gradient(135deg, #185a9d 0%, #43cea2 100%);
           padding: 10px 15px;
           margin: 20px -10px 8px 0;
           border-radius: 0 8px 8px 0;">
    Table of Contents
</div>

<div style="margin-left: 20px;
            margin-bottom: 20px;
            line-height: 1.4;">
    <!-- Level 1 -->
    <div style="font-size: 15px;
                font-weight: 600;
                margin: 8px 0 8px 0;">
        <a href="#1" 
           style="color: white;
                  text-decoration: none;
                  display: inline-block;
                  background: linear-gradient(135deg, #185a9d 0%, #43cea2 100%);
                  padding: 2px 10px;
                  border-radius: 4px;"
            onmouseover="this.style.opacity='0.8'" 
            onmouseout="this.style.opacity='1'">
            1 Importing Libraries
        </a>
    </div>
    <!-- Level 1 -->
    <div style="font-size: 15px;
                font-weight: 600;
                margin: 8px 0 8px 0;">
        <a href="#2" 
           style="color: white;
                  text-decoration: none;
                  display: inline-block;
                  background: linear-gradient(135deg, #185a9d 0%, #43cea2 100%);
                  padding: 2px 10px;
                  border-radius: 4px;"
            onmouseover="this.style.opacity='0.8'" 
            onmouseout="this.style.opacity='1'">
            2 Data Description and Exploration
        </a>
    </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#2-1" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                2.1 Data Description
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#2-2" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                2.2 Data Loading
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#2-3" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                2.3 Data Exploration
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#2-4" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                2.4 Intermediate Conclusion
            </a>
        </div>
</div>
    <!-- Level 1 -->
    <div style="font-size: 15px;
                font-weight: 600;
                margin: 8px 0 8px 0;">
        <a href="#3" 
           style="color: white;
                  text-decoration: none;
                  display: inline-block;
                  background: linear-gradient(135deg, #185a9d 0%, #43cea2 100%);
                  padding: 2px 10px;
                  border-radius: 4px;"
            onmouseover="this.style.opacity='0.8'" 
            onmouseout="this.style.opacity='1'">
            3 Data Preprocessing
        </a>
    </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#3-1" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                3.1 Initial Data Filtering
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#3-2" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                3.2 Outlier Handling
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#3-3" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                3.3 Missing Value Handling
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#3-4" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                3.4 Duplicate Handling
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#3-5" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                3.5 Creating New Metrics
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#3-6" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                3.6 Creating New Dimensions
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#3-7" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                3.7 Converting Data to a Convenient Format
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#3-8" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                3.8 Data Merging
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#3-9" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                3.9 Intermediate Conclusion
            </a>
        </div>
</div>
    <!-- Level 1 -->
    <div style="font-size: 15px;
                font-weight: 600;
                margin: 8px 0 8px 0;">
        <a href="#4" 
           style="color: white;
                  text-decoration: none;
                  display: inline-block;
                  background: linear-gradient(135deg, #185a9d 0%, #43cea2 100%);
                  padding: 2px 10px;
                  border-radius: 4px;"
            onmouseover="this.style.opacity='0.8'" 
            onmouseout="this.style.opacity='1'">
            4 Data Analysis
        </a>
    </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#4-1" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                4.1 Creating Analysis Class
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#4-2" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                4.2 Time Series Analysis
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#4-3" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                4.3 Customer Analysis
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#4-4" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                4.4 Seller Analysis
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#4-5" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                4.5 Sales Analysis
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#4-6" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                4.6 Product Analysis
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#4-7" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                4.7 Review Analysis
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#4-8" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                4.8 Delivery Analysis
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#4-9" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                4.9 Payment Analysis
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#4-10" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                4.10 Geographical Analysis
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#4-11" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                4.11 Cohort Analysis
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#4-12" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                4.12 Correlation Analysis
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#4-13" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                4.13 Slice Analysis
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#4-14" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                4.14 Analysis of Customer Segments
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#4-15" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                4.15 RFM Analysis
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#4-16" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                4.16 Customer Clustering
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#4-17" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                4.17 Hypothesis Testing
            </a>
        </div>
</div>
    <!-- Level 1 -->
    <div style="font-size: 15px;
                font-weight: 600;
                margin: 8px 0 8px 0;">
        <a href="#5" 
           style="color: white;
                  text-decoration: none;
                  display: inline-block;
                  background: linear-gradient(135deg, #185a9d 0%, #43cea2 100%);
                  padding: 2px 10px;
                  border-radius: 4px;"
            onmouseover="this.style.opacity='0.8'" 
            onmouseout="this.style.opacity='1'">
            5 General Conclusion
        </a>
    </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#5-1" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                5.1 Time Dynamics
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#5-2" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                5.2 Customers
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#5-3" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                5.3 Sales
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#5-4" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                5.4 Products
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#5-5" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                5.5 Reviews
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#5-6" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                5.6 Delivery
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#5-7" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                5.7 Payments
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#5-8" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                5.8 Sellers
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#5-9" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                5.9 Cohort Analysis
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#5-10" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                5.10 Correlation Analysis
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#5-11" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                5.11 Black Friday
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#5-12" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                5.12 Cancelled Orders
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#5-13" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                5.13 Hypothesis Testing Results
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#5-14" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                5.14 Detected Anomalies
            </a>
        </div>
</div>
    <!-- Level 1 -->
    <div style="font-size: 15px;
                font-weight: 600;
                margin: 8px 0 8px 0;">
        <a href="#6" 
           style="color: white;
                  text-decoration: none;
                  display: inline-block;
                  background: linear-gradient(135deg, #185a9d 0%, #43cea2 100%);
                  padding: 2px 10px;
                  border-radius: 4px;"
            onmouseover="this.style.opacity='0.8'" 
            onmouseout="this.style.opacity='1'">
            6 Recommendations
        </a>
    </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#6-1" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                6.1 Product market fit
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#6-2" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                6.2 Customer loyalty Enhancement
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#6-3" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                6.3 Customer Experience Improvement
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#6-4" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                6.4 Logistics and Delivery Optimization
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#6-5" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                6.5 Financial optimization
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#6-6" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                6.6 Average Order Value and Basket Optimization
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#6-7" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                6.7 Segment-Specific Strategies
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#6-8" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                6.8 Product and Assortment Management
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#6-9" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                6.9 Marketing and Promotion
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#6-10" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                6.10 Peak Season Preparedness
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#6-11" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                6.11 Analytics and Monitoring
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#6-12" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                6.12 A/B Testing
            </a>
        </div>
   </div>
    <!-- Level 2 -->
    <div style="margin-left: 20px;">
        <div style="font-size: 14px;
                    font-weight: 500;
                    margin: 6px 0;">
            <a href="#6-13" 
               style="color: #185a9d"
                onmouseover="this.style.opacity='0.8'" 
                onmouseout="this.style.opacity='1'">               
                6.13 Anomaly
            </a>
        </div>
   </div>
</div>


<h1 id="1"> 1 Importing Libraries</h1>

In [None]:
%%capture
!pip install --user --no-deps frameon
!pip install pingouin scikit-posthocs
!pip install --upgrade plotly kaleido
!echo "y" | plotly_get_chrome
import sys
sys.path.append('/root/.local/lib/python3.11/site-packages')
from typing import Dict, Optional, Union
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from functools import partial
from IPython.display import HTML, display
import geopandas as gpd
import frameon as fron
from frameon import FrameOn as fo
import json
from plotly.subplots import make_subplots
from scipy.cluster.hierarchy import dendrogram, linkage
import plotly.figure_factory as ff
import statsmodels.api as sm
from urllib.request import urlopen
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import silhouette_score
import warnings
# Notebook-wide display settings.
# Silence every warning category so library warnings do not clutter cell output.
warnings.filterwarnings('ignore')
# Never truncate wide dataframes: show all columns.
pd.set_option('display.max_columns', None)
# Render floats with a thousands separator and two decimals (e.g. 1,234.50).
pd.options.display.float_format = '{:,.2f}'.format
# Render plotly figures as static JPEG images.
# NOTE(review): presumably the reason kaleido/Chrome are installed above - confirm.
pio.renderers.default = "jpg"
# Clear plotly's global default export width/height.
# NOTE(review): presumably so each figure's own layout size is used - confirm.
pio.defaults.default_width = None
pio.defaults.default_height = None

<h1 id="2"> 2 Data Description and Exploration</h1>

<h2 id="2-1"> 2.1 Data Description</h2>

The dataset is an extensive collection of e-commerce data from Brazil. It was gathered by Olist, a company that provides an online sales platform. The dataset covers the period from 2016 to 2018 and includes information about purchases made on the Olist platform.

Olist operates as a marketplace, enabling small and medium-sized businesses to sell their products through various channels, including major platforms like Amazon and Mercado Livre. Integration with other marketplaces allows sellers to manage orders and inventory centrally, significantly expanding their reach and simplifying the sales process.

Products listed on Olist can automatically be offered for sale on other platforms, increasing visibility and potential sales. However, the dataset only includes data on sales made directly through the Olist platform. It is important to note that since Olist also sells its products through various marketplaces, buyers may prefer to make purchases on those platforms rather than on the Olist website.

Key notes about the data:

- The data is a random sample of all purchases that received customer reviews.
- Each product can be shipped by different sellers.

The dataset follows this schema:

<img src="https://raw.githubusercontent.com/PavelGrigoryevDS/olist-deep-dive/main/data/data_scheme.png" alt="" width="900">

**Orders:**

Field | Description
-|-
order_id | Order ID.
customer_id | Customer ID.
order_status | Order status.
order_purchase_timestamp | Date and time of purchase.
order_approved_at | Date and time when the payment was approved. The payment was successful, and the order was approved for processing.
order_delivered_carrier_date | Date and time when the order was handed over to the carrier.
order_delivered_customer_date | Date when the order was delivered to the customer.
order_estimated_delivery_date | Estimated delivery date (set before actual delivery begins).

- For analyzing purchases and user behavior, it is advisable to use the order_purchase_timestamp time.
- This is the time when the customer completes the purchase process, allowing precise tracking of when the purchase decision was made.
- Using this time is particularly important for correctly identifying purchase days (weekdays or weekends) and analyzing temporal dynamics. Choosing another time (e.g., payment approval time) may distort the data, as it does not reflect the moment when the user took action.
  
The difference between order_purchase_timestamp and order_approved_at:

- order_purchase_timestamp
    - Indicates the moment when the customer completes the purchase process. This means the order was placed and confirmed but does not necessarily mean it was created in the system. This is the moment when the customer clicks the "Buy" button and initiates the process. At this point, the order status is set to "created".
- order_approved_at
    - Indicates the moment when the order was approved after successful payment verification. This means the funds were confirmed, and the order is ready for further processing. At this point, the order status changes to "approved".
  
Order status can be one of the following:

- created
    - The customer visits the Olist platform and selects a product they want to buy. After adding the product to the cart and completing the checkout process, the order status is set to "created". This means the order was successfully created but has not yet been processed.
- approved
    - After the order is created, the system checks the payment information. If the payment is successful, the order status changes to "approved". This means the order is approved for further processing.   
- invoiced
    - At this stage, an invoice may be issued. The status changes to "invoiced", meaning the order cost information has been recorded, and the customer has been provided with an invoice. This status may not always appear, as not all orders require it.
- processing
    - After the order is approved, the seller begins processing it. The status changes to "processing", indicating the order is being prepared for shipment.
- shipped
    - Once the order is packed, it is handed over to the courier service for delivery. The status changes to "shipped", indicating the order has left the seller's warehouse and is on its way to the buyer.
- delivered
    - When the courier delivers the order to the customer, the status changes to "delivered". This means the customer has received their product, and the order fulfillment process is complete.
- unavailable
    - If the product becomes unavailable after the order is created (e.g., sold out), the status may change to "unavailable". This can happen during processing.
- canceled
    - If the customer decides to cancel the order at any stage, the status may change to "canceled".

**Customers:**

Field | Description
-|-
customer_id | ID assigned to each order in the dataset (each order has a unique customer_id).
customer_unique_id | Customer ID used to identify a specific customer in the system.
customer_zip_code_prefix | First five digits of the customer's postal code.
customer_city | Customer's city.
customer_state | State where the customer is located.

**Geolocation:**

Field | Description
-|-
geolocation_zip_code_prefix | First five digits of the postal code.
geolocation_lat | Latitude.
geolocation_lng | Longitude.
geolocation_city | City.
geolocation_state | State.

**Order_items:**

Field | Description
-|-
order_id | Order ID.
order_item_id | Order item ID.
product_id | Product ID.
seller_id | Seller ID.
shipping_limit_date | Date by which the seller must hand over the product to the logistics company.
price | Product price.
freight_value | Shipping cost (if the order includes multiple products, the shipping cost is divided among them).

**Order_payments:**

Field | Description
-|-
order_id | Order ID.
payment_sequential | Sequential number of the payment in the order.
payment_type | Payment type: credit and debit cards, voucher (coupon or certificate), boleto (electronic check).
payment_installments | Number of installments (if the payment is split into multiple parts).
payment_value | Payment amount.

Boleto is a document representing a payment invoice. It contains information about the amount to be paid and the recipient's details.

The customer selects this payment method, after which the store issues a boleto voucher to the customer. The customer must pay it by the specified deadline.

The customer can pay it via online banking, ATMs, or bank tellers. After payment, the bank processes the transaction and credits the amount to the issuer's account. The payment confirmation process typically takes 1 to 3 business days.

**Order_reviews:**

Field | Description
-|-
review_id | Review ID.
order_id | Order ID.
review_score | Review score (1 to 5).
review_comment_title | Review title.
review_comment_message | Review text.
review_creation_date | Date the review was created.
review_answer_timestamp | Date and time the review was answered.

**Products:**

Field | Description
-|-
product_id | Product ID.
product_category_name | Product category.
product_name_lenght | Length of the product name.
product_description_lenght | Length of the product description.
product_photos_qty | Number of product photos.
product_weight_g | Product weight in grams.
product_length_cm | Product length in centimeters.
product_height_cm | Product height in centimeters.
product_width_cm | Product width in centimeters.

**Sellers:**

Field | Description
-|-
seller_id | Seller ID.
seller_zip_code_prefix | First five digits of the seller's postal code.
seller_city | Seller's city.
seller_state | Seller's state.

**Product_category_name:**

Field | Description
-|-
product_category_name | Product category in Portuguese.
product_category_name_english | Product category in English.

<h2 id="2-2"> 2.2 Data Loading</h2>

- Load the data.
- Assign data types to columns where possible. Columns with missing values cannot be immediately converted to integer type. Leave such columns as is for now.
- Display the first few rows of each dataframe.
- Examine column types.
- For categorical variables, perform normalization and convert to Title Case format for consistency and better visual presentation.
- For text variables, normalize and convert to lowercase to eliminate implicit duplicates.

**Table df_orders**

In [None]:
# Load the orders table: order_status as a categorical and the five lifecycle
# timestamps parsed with an explicit format.
dtype = {'order_status': 'category'}
df_orders = pd.read_csv(
    '/kaggle/input/brazilian-ecommerce/olist_orders_dataset.csv',
    dtype=dtype,
    parse_dates=[
        'order_purchase_timestamp',
        'order_approved_at',
        'order_delivered_carrier_date',
        'order_delivered_customer_date',
        'order_estimated_delivery_date',
    ],
    date_format='%Y-%m-%d %H:%M:%S',
)
# Give every datetime column a uniform "_dt" suffix, then wrap the frame with
# frameon's FrameOn (later cells use its .preproc / .explore accessors).
df_orders = fo(
    df_orders.rename(
        columns={
            'order_purchase_timestamp': 'order_purchase_dt',
            'order_approved_at': 'order_approved_dt',
            'order_delivered_carrier_date': 'order_delivered_carrier_dt',
            'order_delivered_customer_date': 'order_delivered_customer_dt',
            'order_estimated_delivery_date': 'order_estimated_delivery_dt',
        }
    )
)
# Preview five reproducibly sampled rows.
df_orders.sample(5, random_state=7)

In [None]:
df_orders.dtypes

In [None]:
# Normalize the order_status labels in place through frameon's preproc accessor.
# NOTE(review): the default case format appears to be Title Case - later cells
# compare against 'Delivered' / 'Created'; confirm in the frameon docs.
df_orders.order_status.preproc.normalize_string_series(inplace=True)

**Table df_payments**

In [None]:
# Load the order payments table with payment_type as a categorical and wrap
# it with frameon's FrameOn.
dtype = dict.fromkeys(['payment_type'], 'category')
df_payments = fo(
    pd.read_csv(
        '/kaggle/input/brazilian-ecommerce/olist_order_payments_dataset.csv',
        dtype=dtype,
    )
)
# Preview five reproducibly sampled rows.
df_payments.sample(5, random_state=7)

In [None]:
df_payments.dtypes

In [None]:
# Normalize the payment_type labels in place with the default case format.
# NOTE(review): presumably Title Case, per the loading notes - confirm in the frameon docs.
df_payments.payment_type.preproc.normalize_string_series(inplace=True)

**Table df_items**

In [None]:
# Load the order items table, parse the shipping deadline as a datetime,
# rename it to the "_dt" convention and wrap the frame with frameon's FrameOn.
df_items = fo(
    pd.read_csv(
        '/kaggle/input/brazilian-ecommerce/olist_order_items_dataset.csv',
        parse_dates=['shipping_limit_date'],
        date_format='%Y-%m-%d %H:%M:%S',
    ).rename(columns={'shipping_limit_date': 'shipping_limit_dt'})
)
# Preview five reproducibly sampled rows.
df_items.sample(5, random_state=7)

In [None]:
df_items.dtypes

**Table df_customers**

In [None]:
# Load the customers table with city and state as categoricals and wrap it
# with frameon's FrameOn.
dtype = dict.fromkeys(['customer_city', 'customer_state'], 'category')
df_customers = fo(
    pd.read_csv(
        '/kaggle/input/brazilian-ecommerce/olist_customers_dataset.csv',
        dtype=dtype,
    )
)
# Preview five reproducibly sampled rows.
df_customers.sample(5, random_state=7)

In [None]:
df_customers.dtypes

In [None]:
# Normalize customer city names in place with the default case format
# (presumably Title Case - confirm in the frameon docs).
df_customers.customer_city.preproc.normalize_string_series(inplace=True)
# State values are normalized to upper case instead.
df_customers.customer_state.preproc.normalize_string_series(case_format='upper', inplace=True)

**Table df_reviews**

In [None]:
# Load the translated reviews table. The two date columns use different
# formats (date only vs. full timestamp), so each gets its own format string.
df_reviews = fo(
    pd.read_csv(
        '/kaggle/input/olist-order-reviews-translated/olist_order_reviews_dataset_translated.csv',
        parse_dates=['review_creation_date', 'review_answer_timestamp'],
        date_format={
            'review_creation_date': '%Y-%m-%d',
            'review_answer_timestamp': '%Y-%m-%d %H:%M:%S',
        },
    ).rename(
        columns={
            'review_creation_date': 'review_creation_dt',
            'review_answer_timestamp': 'review_answer_dt',
        }
    )
)
# Preview five reproducibly sampled rows.
df_reviews.sample(5, random_state=7)

In [None]:
df_reviews.dtypes

In [None]:
# Free-text review fields are normalized to lower case in place; the loading
# notes give the reason: to eliminate implicit duplicates.
df_reviews.review_comment_title.preproc.normalize_string_series(case_format='lower', inplace=True)
df_reviews.review_comment_message.preproc.normalize_string_series(case_format='lower', inplace=True)

**Table df_products**

In [None]:
# Load the products table with the (Portuguese) category name as a
# categorical and wrap it with frameon's FrameOn.
dtype = dict.fromkeys(['product_category_name'], 'category')
df_products = fo(
    pd.read_csv(
        '/kaggle/input/brazilian-ecommerce/olist_products_dataset.csv',
        dtype=dtype,
    )
)
# Preview five reproducibly sampled rows.
df_products.sample(5, random_state=7)

In [None]:
df_products.dtypes

We will not normalize the product_category_name column because we will replace it with an English version.

**Table df_categories**

In [None]:
# Load the Portuguese -> English category translation table with both name
# columns as categoricals and wrap it with frameon's FrameOn.
dtype = dict.fromkeys(
    ['product_category_name', 'product_category_name_english'], 'category'
)
df_categories = fo(
    pd.read_csv(
        '/kaggle/input/brazilian-ecommerce/product_category_name_translation.csv',
        dtype=dtype,
    )
)
# Preview five reproducibly sampled rows.
df_categories.sample(5, random_state=7)

In [None]:
df_categories.dtypes

We will perform normalization only for the English version of the column because we will be working with it exclusively.

In [None]:
# Normalize only the English category names in place, with the default case
# format (presumably Title Case - confirm in the frameon docs).
df_categories.product_category_name_english.preproc.normalize_string_series(inplace=True)

**Table df_sellers**

In [None]:
# Load the sellers table with city and state as categoricals and wrap it
# with frameon's FrameOn.
dtype = dict.fromkeys(['seller_city', 'seller_state'], 'category')
df_sellers = fo(
    pd.read_csv(
        '/kaggle/input/brazilian-ecommerce/olist_sellers_dataset.csv',
        dtype=dtype,
    )
)
# Preview five reproducibly sampled rows.
df_sellers.sample(5, random_state=7)

In [None]:
df_sellers.dtypes

In [None]:
# Normalize seller city names in place with the default case format
# (presumably Title Case - confirm in the frameon docs).
df_sellers.seller_city.preproc.normalize_string_series(inplace=True)
# State values are normalized to upper case instead.
df_sellers.seller_state.preproc.normalize_string_series(case_format='upper', inplace=True)

**Table df_geolocations**

In [None]:
# Load the geolocation table with city and state as categoricals and wrap it
# with frameon's FrameOn.
dtype = dict.fromkeys(['geolocation_city', 'geolocation_state'], 'category')
df_geolocations = fo(
    pd.read_csv(
        '/kaggle/input/brazilian-ecommerce/olist_geolocation_dataset.csv',
        dtype=dtype,
    )
)
# Preview five reproducibly sampled rows.
df_geolocations.sample(5, random_state=7)

In [None]:
df_geolocations.dtypes

In [None]:
# Normalize geolocation city names in place with the default case format
# (presumably Title Case - confirm in the frameon docs).
df_geolocations.geolocation_city.preproc.normalize_string_series(inplace=True)
# State values are normalized to upper case instead.
df_geolocations.geolocation_state.preproc.normalize_string_series(case_format='upper', inplace=True)

We will combine all dataframes into a class for easier further work.

In [None]:
class Dfs:
    """Namespace bundling the notebook's dataframes under short attribute names.

    Each attribute is bound to whatever object the matching module-level
    ``df_*`` variable refers to when the instance is created. Rebinding a
    ``df_*`` name afterwards (e.g. ``df_orders = df_orders.merge(...)``) does
    not update the attribute stored here.

    Iterating over an instance produces ``(name, dataframe)`` pairs.
    """

    # Attribute names in the order in which __iter__ returns them.
    _TABLE_NAMES = (
        'orders',
        'items',
        'reviews',
        'products',
        'geolocations',
        'sellers',
        'payments',
        'customers',
        'categories',
    )

    def __init__(self):
        # Order-related tables.
        self.orders = df_orders
        self.items = df_items
        self.reviews = df_reviews
        # Product and location reference tables.
        self.products = df_products
        self.geolocations = df_geolocations
        self.sellers = df_sellers
        # Remaining tables.
        self.payments = df_payments
        self.customers = df_customers
        self.categories = df_categories

    def __iter__(self):
        """Return an iterator over ``(name, dataframe)`` pairs in a fixed order."""
        pairs = [(name, getattr(self, name)) for name in self._TABLE_NAMES]
        return iter(pairs)
dfs = Dfs()

<h2 id="2-3"> 2.3 Data Exploration</h2>

### 2.3.1 Table df_orders

Let’s look at the information about the dataframe.

In [None]:
df_orders.explore.info()

#### 2.3.1.1 Initial Column Analysis

We will examine each column individually.

**order_id**

In [None]:
df_orders.order_id.explore.info(plot=False)

**customer_id**

In [None]:
df_orders.customer_id.explore.info(plot=False)

**order_status**

In [None]:
df_orders.order_status.explore.info()

**Key Observations:**  

- 97% of all orders were delivered

**order_purchase_dt**

In [None]:
df_orders.order_purchase_dt.explore.info()

**Key Observations:**  

- In order_purchase_dt missing 4% of months, 10% of weeks, 18% of days


**order_approved_dt**

In [None]:
df_orders.order_approved_dt.explore.info()

**Key Observations:**  

- In order_approved_dt 160 missing values (<1% of total rows)
- In order_approved_dt missing 4% of months, 11% of weeks, 15% of days

**order_delivered_carrier_dt**

In [None]:
df_orders.order_delivered_carrier_dt.explore.info()

**Key Observations:**  

- In order_delivered_carrier_dt 1.78k missing values (2% of total rows).
- In order_delivered_carrier_dt missing 2% of weeks, 22% of days.

**order_delivered_customer_dt**

In [None]:
df_orders.order_delivered_customer_dt.explore.info()

**Key Observations:**  

- In order_delivered_customer_dt 2.96k missing values (3% of total rows).
- In order_delivered_customer_dt missing 3% of weeks, 12% of days.

**order_estimated_delivery_dt**

In [None]:
df_orders.order_estimated_delivery_dt.explore.info(plot=False)

**Key Observations:**  

- In order_estimated_delivery_dt missing 4% of weeks, 41% of days.

#### 2.3.1.2 Adding Temporary Dimensions

To study anomalies across different dimensions, we will add temporary metrics.

We will prefix their names with 'tmp_' to indicate that these are temporary metrics to be removed later.

They are temporary because the data may change after preprocessing.

Therefore, the primary metrics will be created after preprocessing.

Let’s check the initial DataFrame size and save it to ensure no data is lost later.

In [None]:
# Record the baseline before enriching df_orders: print the row count and keep
# the order ids so the result of the merges can be compared against them below.
print(df_orders.shape[0])
tmp_ids = df_orders.order_id

In [None]:
# Build one row per order from the satellite tables, attach the results to
# df_orders as temporary "tmp_" dimensions, and derive categorical flags.


def tmp_join_unique(values):
    """Return the distinct values of a group as one comma-separated string."""
    return ', '.join(values.unique())


# Average review score per order, floored to an integer category.
tmp_df_reviews = (
    df_reviews.groupby('order_id', as_index=False)
    .agg(tmp_avg_reviews_score=('review_score', 'mean'))
    .assign(
        tmp_avg_reviews_score=lambda d: np.floor(d['tmp_avg_reviews_score'])
        .astype(int)
        .astype('category')
    )
)

# Distinct payment types used within each order.
tmp_df_payments = df_payments.groupby('order_id', as_index=False).agg(
    tmp_payment_types=('payment_type', tmp_join_unique)
)

# Distinct product categories within each order; items whose product has no
# category get an explicit placeholder label.
tmp_df_items = df_items.merge(df_products, on='product_id', how='left')
tmp_df_items['product_category_name'] = (
    tmp_df_items['product_category_name']
    .cat.add_categories(['missed in df_products'])
    .fillna('missed in df_products')
)
tmp_df_items = tmp_df_items.groupby('order_id', as_index=False).agg(
    tmp_product_categories=('product_category_name', tmp_join_unique)
)

# Left-join everything onto the orders so that no order row is dropped.
df_orders = (
    df_orders
    .merge(tmp_df_reviews, on='order_id', how='left')
    .merge(tmp_df_payments, on='order_id', how='left')
    .merge(tmp_df_items, on='order_id', how='left')
    .merge(
        df_customers[['customer_id', 'customer_state']],
        on='customer_id',
        how='left',
    )
    .rename(columns={'customer_state': 'tmp_customer_state'})
)

# Orders absent from the items / payments tables get an explicit label.
for tmp_col, tmp_label in (
    ('tmp_product_categories', 'Missing in Items'),
    ('tmp_payment_types', 'Missing in Pays'),
):
    df_orders[tmp_col] = df_orders[tmp_col].fillna(tmp_label).astype('category')

# Calendar dimensions of the purchase moment.
tmp_purchase_dt = df_orders['order_purchase_dt']
df_orders['tmp_order_purchase_month'] = (
    tmp_purchase_dt.dt.month_name().fillna('Missing purchase dt').astype('category')
)
df_orders['tmp_order_purchase_weekday'] = (
    tmp_purchase_dt.dt.day_name().fillna('Missing purchase dt').astype('category')
)

# Time of day: Morning 04-11, Afternoon 12-16, Evening 17-22, Night 23-03.
tmp_purchase_hour = tmp_purchase_dt.dt.hour
conditions = [
    tmp_purchase_dt.isna(),
    tmp_purchase_hour.between(4, 11),
    tmp_purchase_hour.between(12, 16),
    tmp_purchase_hour.between(17, 22),
    tmp_purchase_hour.isin([23, 0, 1, 2, 3]),
]
choices = ['Missing purchase dt', 'Morning', 'Afternoon', 'Evening', 'Night']
df_orders['tmp_purchase_time_of_day'] = np.select(conditions, choices, default=choices[0])

# Delay flag: actual delivery date compared with the estimated one.
tmp_delivered_dt = df_orders['order_delivered_customer_dt']
tmp_estimated_dt = df_orders['order_estimated_delivery_dt']
conditions = [
    tmp_delivered_dt.isna() | tmp_estimated_dt.isna(),
    tmp_delivered_dt > tmp_estimated_dt,
    tmp_delivered_dt <= tmp_estimated_dt,
]
choices = ['Missing delivery dt', 'Delayed', 'Not Delayed']
df_orders['tmp_is_delayed'] = np.select(conditions, choices, default=choices[0])

# Delivered flag derived from the order status.
tmp_status = df_orders['order_status']
conditions = [
    tmp_status.isna(),
    tmp_status == 'Delivered',
    tmp_status != 'Delivered',
]
choices = ['Missing Status', 'Delivered', 'Not Delivered']
df_orders['tmp_is_delivered'] = np.select(conditions, choices, default=choices[0])

# Store the three derived flags as categoricals.
for tmp_col in ('tmp_purchase_time_of_day', 'tmp_is_delayed', 'tmp_is_delivered'):
    df_orders[tmp_col] = df_orders[tmp_col].astype('category')

# Drop the intermediate frames and the helper names introduced in this cell.
del tmp_df_reviews, tmp_df_payments, tmp_df_items
del tmp_join_unique, tmp_col, tmp_label
del tmp_purchase_dt, tmp_purchase_hour, tmp_delivered_dt, tmp_estimated_dt, tmp_status

Let’s verify that nothing was lost.

In [None]:
df_orders.shape[0]

In [None]:
set(df_orders.order_id) == set(tmp_ids)

All good.

#### 2.3.1.3 Exploring Missing Values

Let’s examine which columns contain missing values.

In [None]:
df_orders.explore.anomalies_report(
    anomaly_type='missing'
    , width=600
)

**Key Observations:**
- Missing values in these columns likely belong to orders that did not reach a certain status.

We will analyze missing values in each column separately.

**Missing in order_approved_dt**

In [None]:
tmp_miss = df_orders[df_orders['order_approved_dt'].isna()]

Let’s examine missing values in payment approval time over time.
Time will be based on order creation time.

In [None]:
df_orders['order_approved_dt'].explore.anomalies_over_time(
    time_column='order_purchase_dt'
    , anomaly_type='missing'
    , freq='W'
)

**Key Observations:**

- In February 2017 and August 2018, there was a spike in orders missing payment approval timestamps.

Let’s analyze by order status.

In [None]:
df_orders['order_approved_dt'].explore.anomalies_by_categories(
    anomaly_type='missing'
    , pct_diff_threshold=-100
    , include_columns='order_status'
)

**Key Observations:**

- Missing values in the "canceled" and "created" statuses are logical.
- However, 14 missing values in order_approved_dt for orders with "delivered" status are unusual.

Let’s examine these 14 delivered orders with missing order_approved_dt.

In [None]:
tmp_miss[lambda x: x.order_status == 'Delivered']

**Key Observations:**  

- All delivered orders with missing order_approved_dt used "boleto" as the payment method. This may be a characteristic of "boleto" usage.
- All these orders were placed in January and February 2017. There may have been a system issue where approval timestamps were not saved.

Let’s examine the 5 "created" orders with missing payment approval timestamps.

In [None]:
tmp_miss[lambda x: x.order_status == 'Created']

**Key Observations:**  

- Orders with "created" status and missing payment approval timestamps were placed long ago and never delivered. The data may not have been processed.

Let’s analyze by average order review score.

In [None]:
df_orders['order_approved_dt'].explore.anomalies_by_categories(
    anomaly_type='missing'
    , pct_diff_threshold=-100
    , include_columns='tmp_avg_reviews_score'
)

**Key Observations:**
- The difference in proportions is significantly higher for score 1. These orders were likely not delivered.

Let’s examine a word cloud from review messages.

In [None]:
tmp_miss = tmp_miss.merge(df_reviews, on='order_id', how='left')

In [None]:
tmp_miss.viz.wordcloud('review_comment_message')

**Key Observations:**  

- Many words relate to delivery.

Let’s analyze the sentiment of the text.

In [None]:
tmp_miss.analysis.sentiment('review_comment_message')

**Key Observations:**  

- The sentiment is not predominantly negative.

Let’s randomly sample 20 review comments.
We’ll repeat this several times.

In [None]:
# Show up to 20 random non-empty review comments. The sample size is capped at
# the number of available comments, because Series.sample raises ValueError
# when asked for more rows than exist. No random_state is passed, so each run
# of the cell draws a different sample.
messages = (
    tmp_miss['review_comment_message']
    .dropna()
    .pipe(lambda comments: comments.sample(min(20, len(comments))))
    .tolist()
)
display(messages)

**Key Observations:**

- Based on review messages, many orders were not delivered, but a significant number were delivered.
- Therefore, missing payment approval timestamps cannot be assumed to indicate order cancellation.

Let’s analyze by payment type.

In [None]:
df_orders['order_approved_dt'].explore.anomalies_by_categories(
    anomaly_type='missing'
    , pct_diff_threshold=-100
    , include_columns='tmp_payment_types'
)

**Key Observations:**

- The proportion of "voucher" payments is significantly higher among orders with missing values than in the full dataset.
- The "voucher" payment type is the one most strongly associated with missing payment approval timestamps. This is likely a characteristic of this payment method.

Let’s analyze by month.

In [None]:
df_orders['order_approved_dt'].explore.anomalies_by_categories(
    anomaly_type='missing'
    , pct_diff_threshold=-100
    , include_columns='tmp_order_purchase_month'
)

**Key Observations:**
- August has a noticeably higher proportion of missing values than other months. This is also visible in the graph above.

---

**Missing Values in order_delivered_carrier_dt**

In [None]:
tmp_miss = df_orders[df_orders['order_delivered_carrier_dt'].isna()]

Let’s examine the distribution of missing values in the carrier handover time.

In [None]:
df_orders['order_delivered_carrier_dt'].explore.anomalies_over_time(
    time_column='order_purchase_dt'
    , anomaly_type='missing'
    , freq='W'
)

**Key Observations:**  

- In November 2017, there was a spike in orders missing carrier handover timestamps. This may be related to Black Friday.

Let’s analyze by order status.

In [None]:
df_orders['order_delivered_carrier_dt'].explore.anomalies_by_categories(
    anomaly_type='missing'
    , pct_diff_threshold=-100
    , include_columns='order_status'
)

**Key Observations:**
- There are 2 delivered orders with missing order_delivered_carrier_dt.
- All orders with "unavailable" status have missing order_delivered_carrier_dt.

Let’s examine these 2 delivered orders.

In [None]:
tmp_miss[lambda x: x.order_status == 'Delivered'].merge(df_payments, on='order_id', how='left')

**Key Observations:**  

- Both orders with missing order_delivered_carrier_dt were paid via credit card.

Let’s analyze by average review score.

In [None]:
df_orders['order_delivered_carrier_dt'].explore.anomalies_by_categories(
    anomaly_type='missing'
    , pct_diff_threshold=-100
    , include_columns='tmp_avg_reviews_score'
)

**Key Observations:**

- The difference in proportions is significantly higher for score 1. These orders were likely not delivered.
- Review score 1 has the strongest correlation with missing carrier handover timestamps. This suggests these orders were not delivered, and customers were highly dissatisfied.

Let’s examine a word cloud from review messages.

In [None]:
tmp_miss = tmp_miss.merge(df_reviews, on='order_id', how='left')

In [None]:
tmp_miss.viz.wordcloud('review_comment_message')

**Key Observations:**  

- Many words relate to delivery.

Let’s analyze the sentiment of the text.

In [None]:
tmp_miss.analysis.sentiment('review_comment_message')

**Key Observations:**  

- Negative reviews outnumber positive ones, and the boxplot body lies mostly below 0.

Let’s randomly sample 20 review comments.
We’ll repeat this several times.

In [None]:
# Show up to 20 random non-empty review comments. The sample size is capped at
# the number of available comments, because Series.sample raises ValueError
# when asked for more rows than exist. No random_state is passed, so each run
# of the cell draws a different sample.
messages = (
    tmp_miss['review_comment_message']
    .dropna()
    .pipe(lambda comments: comments.sample(min(20, len(comments))))
    .tolist()
)
display(messages)

**Key Observations:**

- Based on review messages, many orders were not delivered, but a significant number were delivered.
- Orders with missing carrier handover timestamps were more frequently undelivered compared to those with missing payment approval timestamps.
- Some products may have been out of stock, and sellers did not hand them over to carriers.
- However, since many orders were still delivered, missing values cannot be assumed to indicate order cancellation.

Let’s analyze by customer state.

In [None]:
df_orders['order_delivered_carrier_dt'].explore.anomalies_by_categories(
    anomaly_type='missing'
    , include_columns='tmp_customer_state'
)

**Key Observations:**
- The difference in proportions is slightly higher in São Paulo compared to other states.

---

**Missing Values in order_delivered_customer_dt**

In [None]:
tmp_miss = df_orders[df_orders['order_delivered_customer_dt'].isna()]

Let’s examine the distribution of missing values in customer delivery time.

In [None]:
df_orders['order_delivered_customer_dt'].explore.anomalies_over_time(
    time_column='order_purchase_dt'
    , anomaly_type='missing'
    , freq='W'
)

**Key Observations:**  

- In November 2017, there was a spike in orders missing customer delivery timestamps.

Let’s analyze by order status.

In [None]:
df_orders['order_delivered_customer_dt'].explore.anomalies_by_categories(
    anomaly_type='missing'
    , pct_diff_threshold=-100
    , include_columns='order_status'
)

**Key Observations:**
- There are 8 orders with "delivered" status but missing delivery timestamps.

Let’s analyze by customer state.

In [None]:
df_orders['order_delivered_customer_dt'].explore.anomalies_by_categories(
    anomaly_type='missing'
    , include_columns='tmp_customer_state'
)

**Key Observations:**
- The difference in proportions is slightly higher in Rio de Janeiro.

Let’s examine these 8 delivered orders.

In [None]:
tmp_miss[lambda x: x.order_status == 'Delivered'].merge(df_payments, on='order_id', how='left')

**Key Observations:**  

- 7 out of 8 orders with missing order_delivered_customer_dt were paid via credit card, and 1 was paid via debit card.

Let’s analyze by average review score.

In [None]:
df_orders['order_delivered_customer_dt'].explore.anomalies_by_categories(
    anomaly_type='missing'
    , pct_diff_threshold=-100
    , include_columns='tmp_avg_reviews_score'
)

**Key Observations:**
- The difference in proportions is significantly higher for score 1. These orders were likely not delivered.

Let’s examine a word cloud from review messages.

In [None]:
tmp_miss = tmp_miss.merge(df_reviews, on='order_id', how='left')

In [None]:
tmp_miss.viz.wordcloud('review_comment_message')

**Key Observations:**  

- Many words relate to delivery.

Let’s analyze the sentiment of the text.

In [None]:
tmp_miss.analysis.sentiment('review_comment_message')

**Key Observations:**  

- Negative reviews outnumber positive ones, and the boxplot body lies mostly below 0.

Let’s randomly sample 20 review comments.  
We’ll repeat this several times.

In [None]:
# Show up to 20 random non-empty review comments. The sample size is capped at
# the number of available comments, because Series.sample raises ValueError
# when asked for more rows than exist. No random_state is passed, so each run
# of the cell draws a different sample.
messages = (
    tmp_miss['review_comment_message']
    .dropna()
    .pipe(lambda comments: comments.sample(min(20, len(comments))))
    .tolist()
)
display(messages)

**Key Observations:**

- Based on review messages, some orders were not delivered, but this is less frequent than with missing payment approval or carrier handover timestamps.
- Many messages confirm order receipt. Thus, these orders cannot be assumed canceled.

In [None]:
del tmp_miss

#### 2.3.1.4 Anomalies in Order Status

We have many orders with statuses other than "delivered." This is unusual. Let's investigate this.

Let's examine by status.

In [None]:
df_orders.order_status.value_counts()  

Let’s look at missing values in the timestamps by order status.

In [None]:
# Status column (used for grouping) followed by the order timestamp columns.
columns = [
    "order_status",
    "order_purchase_dt",
    "order_approved_dt",
    "order_delivered_carrier_dt",
    "order_delivered_customer_dt",
    "order_estimated_delivery_dt",
]
# For every order status, count the missing values in each of the other
# selected columns.
(
    df_orders[columns].pivot_table(
        index='order_status',
        # Number of missing entries within the group.
        aggfunc=lambda x: x.isna().sum(),
        # Only relevant if order_status is categorical: skip unused categories.
        observed=True,
    )
    .reset_index()
    # Put the columns back in the order declared above.
    [columns]
)

Let’s look at the number of orders without the delivered status over time.

In [None]:
# Display titles keyed by the column names used in the chart below.
labels = {
    'order_purchase_dt': 'Date',
    'order_id': 'Number of Orders',
    'order_status': 'Order Status',
}
# Monthly number of unique orders per status, excluding delivered orders.
df_orders[df_orders['order_status'] != 'Delivered'].viz.line(
    x='order_purchase_dt',
    y='order_id',
    color='order_status',
    agg_func='nunique',
    freq='ME',
    labels=labels,
    markers=True,
    title='Number of Orders without Delivered Status by Month and Order Status',
)

**Key Observations:**  

- In March and April 2018, there was a sharp spike in orders stuck in the "shipped" status.
- In February and August 2018, there were spikes in the "canceled" status.
- In November 2017, there was a spike in the "unavailable" status. This month included Black Friday.

Let's examine each status separately.

**created**

Let’s look at the rows in the dataframe with orders that have the status ‘created’.

In [None]:
df_orders[lambda x: x.order_status == 'Created']

**Key Observations:**  

- One order has a rating of 5, while four orders have a rating of 1.
- The process stops after purchase, before payment approval.

Let’s look at the review messages.

In [None]:
messages = (
    df_orders[lambda x: x.order_status == 'Created']
    .merge(df_reviews, on='order_id', how='left')
    ['review_comment_message']
    .tolist()
)
display(messages)

**Key Observations:**  

- Based on review comments, these orders were not delivered.

---

**approved**

Let’s look at the rows.

In [None]:
df_orders[lambda x: x.order_status == 'Approved']

**Key Observations:**  

- One order received a rating of 1, the other a 4.
- The process stops after payment approval, before carrier handover.

Let’s look at the review messages.

In [None]:
messages = (
    df_orders[lambda x: x.order_status == 'Approved']
    .merge(df_reviews, on='order_id', how='left')
    ['review_comment_message']
    .tolist()
)
display(messages)

**Key Observations:**  

- No comments were left for these orders.

---

**processing**

Let’s look at orders with the status ‘processing’ by month.

In [None]:
df_orders['order_status'].explore.anomalies_over_time(
    time_column='order_purchase_dt'
    , custom_mask=df_orders.order_status == 'Processing'
    , freq='ME'
)

Let’s look at the number of non-missing values in each timestamp column.

In [None]:
tmp_anomal = df_orders[lambda x: x.order_status == 'Processing']

In [None]:
(
    tmp_anomal[['order_purchase_dt', 'order_approved_dt', 'order_delivered_carrier_dt', 'order_delivered_customer_dt', 'order_estimated_delivery_dt']]
    .count()    
    .to_frame('count')
)

**Key Observations:**  

- The process stops after payment approval, before carrier handover.

Let’s look at it broken down by the average order rating.

In [None]:
df_orders['order_status'].explore.anomalies_by_categories(
    custom_mask=df_orders.order_status == 'Processing'
    , include_columns='tmp_avg_reviews_score'
)

**Key Observations:**  

- 86% of orders with "processing" status have a rating of 1.
- 6% of orders have a rating of 2.
- Customers are clearly dissatisfied.

Let’s look at it broken down by payment type.

In [None]:
df_orders['order_status'].explore.anomalies_by_categories(
    custom_mask=df_orders.order_status == 'Processing'
    , include_columns='tmp_payment_types'
)

**Key Observations:**
- The "boleto" payment type has a slightly higher proportion difference.

Let’s randomly sample 20 review comments.  
We’ll repeat this several times.

In [None]:
tmp_anomal = tmp_anomal.merge(df_reviews, on='order_id', how='left')

In [None]:
# Show up to 20 random non-empty review comments. The sample size is capped at
# the number of available comments, because Series.sample raises ValueError
# when asked for more rows than exist. No random_state is passed, so each run
# of the cell draws a different sample.
messages = (
    tmp_anomal['review_comment_message']
    .dropna()
    .pipe(lambda comments: comments.sample(min(20, len(comments))))
    .tolist()
)
display(messages)

**Key Observations:**  

- Based on review messages, orders were not delivered.
- Some reviews mention items being out of stock.

Let’s examine a word cloud from review messages.

In [None]:
tmp_anomal.viz.wordcloud('review_comment_message')

**Key Observations:**  

- Most words relate to delivery.

Let’s analyze the sentiment of the text.

In [None]:
tmp_anomal.analysis.sentiment('review_comment_message')

**Key Observations:**  

- Negative reviews significantly outnumber positive ones, and the boxplot lies in the negative zone.

---

**invoiced**

Let’s look at orders with the status ‘invoiced’ by month.

In [None]:
df_orders['order_status'].explore.anomalies_over_time(
    time_column='order_purchase_dt'
    , custom_mask=df_orders.order_status == 'Invoiced'
    , freq='ME'
)

Let’s look at the number of non-missing values in each timestamp column.

In [None]:
tmp_anomal = df_orders[lambda x: x.order_status == 'Invoiced']

In [None]:
(
    tmp_anomal[['order_purchase_dt', 'order_approved_dt', 'order_delivered_carrier_dt', 'order_delivered_customer_dt', 'order_estimated_delivery_dt']]
    .count()    
    .to_frame('count')
)

**Key Observations:**  

- The process stops after payment approval, before carrier handover.

Let’s look at it broken down by the average order rating.

In [None]:
df_orders['order_status'].explore.anomalies_by_categories(
    custom_mask=df_orders.order_status == 'Invoiced'
    , include_columns='tmp_avg_reviews_score'
)

**Key Observations:**  

- 74% of orders with "invoiced" status have a rating of 1.
- 9% of orders have a rating of 2.
- Customers are clearly dissatisfied.

Let’s randomly sample 20 review comments.  
We’ll repeat this several times.

In [None]:
tmp_anomal = tmp_anomal.merge(df_reviews, on='order_id', how='left')

In [None]:
# Show up to 20 random non-empty review comments. The sample size is capped at
# the number of available comments, because Series.sample raises ValueError
# when asked for more rows than exist. No random_state is passed, so each run
# of the cell draws a different sample.
messages = (
    tmp_anomal['review_comment_message']
    .dropna()
    .pipe(lambda comments: comments.sample(min(20, len(comments))))
    .tolist()
)
display(messages)

**Key Observations:**  

- Review messages indicate orders were not delivered.
- Some reviews mention items being out of stock.

Let’s examine a word cloud from review messages.

In [None]:
tmp_anomal.viz.wordcloud('review_comment_message')

**Key Observations:**  

- Many words relate to delivery.

Let’s analyze the sentiment of the text.

In [None]:
tmp_anomal.analysis.sentiment('review_comment_message')

**Key Observations:**  

- Negative reviews significantly outnumber positive ones, and the boxplot mostly lies below 0.

---

**unavailable**

Let’s look at orders with the status ‘unavailable’ by month.

In [None]:
df_orders['order_status'].explore.anomalies_over_time(
    time_column='order_purchase_dt'
    , custom_mask=df_orders.order_status == 'Unavailable'
    , freq='ME'
)

Let’s look at the number of non-missing values in each timestamp column.

In [None]:
tmp_anomal = df_orders[lambda x: x.order_status == 'Unavailable']

In [None]:
( 
    tmp_anomal[['order_purchase_dt', 'order_approved_dt', 'order_delivered_carrier_dt', 'order_delivered_customer_dt', 'order_estimated_delivery_dt']]
    .count()    
    .to_frame('count') 
) 

**Key Observations:**  

- The process stops after payment approval, before carrier handover.

Let’s look by the customer’s state.

In [None]:
df_orders['order_status'].explore.anomalies_by_categories(
    custom_mask=df_orders.order_status == 'Unavailable'
    , pct_diff_threshold=1
    , include_columns='tmp_customer_state'
)

**Key Observations:**  

- Among orders with "unavailable" status, the share of customers from São Paulo is higher than in the full dataset.

Let’s look at it broken down by product category.

In [None]:
df_orders['order_status'].explore.anomalies_by_categories(
    custom_mask=df_orders.order_status == 'Unavailable'
    , pct_diff_threshold=0
    , include_columns='tmp_product_categories'
)

**Key Observations:**  

- 99% of orders lack a category, meaning they are not in the items table.

Let’s look at it broken down by payment type.

In [None]:
df_orders['order_status'].explore.anomalies_by_categories(
    custom_mask=df_orders.order_status == 'Unavailable'
    , pct_diff_threshold=0
    , include_columns='tmp_payment_types'
)

**Key Observations:**  

- The "boleto" payment type has a slightly higher proportion difference.

Let’s look at it broken down by the average order rating.

In [None]:
df_orders['order_status'].explore.anomalies_by_categories(
    custom_mask=df_orders.order_status == 'Unavailable'
    , pct_diff_threshold=0
    , include_columns='tmp_avg_reviews_score'
)

**Key Observations:**  

- The difference in proportions is much higher for a rating of 1.
- 78% of orders with "unavailable" status have a rating of 1.
- 8% of orders have a rating of 2.
- Customers are clearly dissatisfied.

Let’s randomly sample 20 review comments.  
We’ll repeat this several times.

In [None]:
tmp_anomal = tmp_anomal.merge(df_reviews, on='order_id', how='left')

In [None]:
# Show up to 20 random non-empty review comments. The sample size is capped at
# the number of available comments, because Series.sample raises ValueError
# when asked for more rows than exist. No random_state is passed, so each run
# of the cell draws a different sample.
messages = (
    tmp_anomal['review_comment_message']
    .dropna()
    .pipe(lambda comments: comments.sample(min(20, len(comments))))
    .tolist()
)
display(messages)

**Key Observations:**  

- Review messages indicate orders were not delivered.
- Some reviews mention items being out of stock.

Let’s examine a word cloud from review messages.

In [None]:
tmp_anomal.viz.wordcloud('review_comment_message')

**Key Observations:**  

- Many words relate to delivery.

Let’s analyze the sentiment of the text.

In [None]:
tmp_anomal.analysis.sentiment('review_comment_message')

**Key Observations:**  

- Negative reviews outnumber positive ones, and the boxplot mostly lies below 0.

---

**canceled**

Let’s look at orders with the status ‘canceled’ by month.

In [None]:
df_orders['order_status'].explore.anomalies_over_time(
    time_column='order_purchase_dt'
    , custom_mask=df_orders.order_status == 'Canceled'
    , freq='ME'
)

Order cancellation can occur at different stages, so there may be missing values at various points. 

Let’s look at the missing values.

In [None]:
tmp_anomal = df_orders[lambda x: x.order_status == 'Canceled']

In [None]:
tmp_anomal.explore.detect_anomalies()

**Conversion at different stages**

Let’s look at the count of different order status timestamps. 

Let’s check if there are any missing values between the dates.

In [None]:
mask = tmp_anomal['order_delivered_carrier_dt'].isna() & tmp_anomal['order_delivered_customer_dt'].notna()
tmp_anomal.loc[mask, 'order_delivered_carrier_dt'] 

In [None]:
mask = tmp_anomal['order_approved_dt'].isna() & tmp_anomal['order_delivered_carrier_dt'].notna()
tmp_anomal.loc[mask, 'order_approved_dt']

All good.

In [None]:
# Funnel table for the orders in tmp_anomal: one row per stage with the number
# of non-missing timestamps and that number as a percentage of the purchase
# count (formatted as text, e.g. '81.3%').
tmp_funnel = (
    tmp_anomal[[
        'order_purchase_dt',
        'order_approved_dt',
        'order_delivered_carrier_dt',
        'order_delivered_customer_dt',
    ]]
    .count()
    .rename('count')
    .to_frame()
    .assign(
        share=lambda stages: (
            stages['count'].mul(100).div(stages.loc['order_purchase_dt', 'count'])
        ).round(1).astype(str) + '%'
    )
    .reset_index(names='stage')
)

In [None]:
px.funnel(
    tmp_funnel, 
    x='count', 
    y='stage', 
    text='share',
    width=600,
    title='Conversion of Different Order Stages with "Canceled" Status'
)

**Key Observations:**  

- The process stops at different stages, most often between payment approval and carrier handover.

Let’s look at the conversion at each stage by month. 

For this, we will count the number of canceled orders with specific timestamps in each period and divide by the number of canceled orders at the time of purchase.

In [None]:
# Monthly conversion by stage for the orders in tmp_anomal:
#   1. bucket orders by purchase month and count non-missing timestamps per stage;
#   2. divide every stage count by the month's purchase count;
#   3. reshape to long format (date, date_type, count) for plotting.
tmp_res_df = (
    tmp_anomal.resample('ME', on='order_purchase_dt')
    .agg(
        purchase=('order_id', 'count'),
        approved=('order_approved_dt', 'count'),
        delivered_carrier=('order_delivered_carrier_dt', 'count'),
        delivered_customer=('order_delivered_customer_dt', 'count'),
    )
    .pipe(lambda monthly: monthly.div(monthly['purchase'], axis=0))
    .reset_index(names='date')
    .melt(id_vars='date', var_name='date_type', value_name='count')
)

Let’s look at the normalized values. That is, each value (the count of orders with a specific timestamp) is divided by the total number of canceled orders purchased in that period.

In [None]:
labels = dict(
    date = 'Date',
    date_type = 'Date Type',
    count = 'Conversion'
)
tmp_res_df.viz.line(
    x='date'
    , y='count'
    , color='date_type'
    , labels=labels
    , title='Conversion of Different Order Stages with "Canceled" Status by Month'
)

**Key Observations:**  

- Canceled orders almost never have delivery timestamps, which is logical.
- From December 2017 to March 2018, there was a significant spike in canceled orders that had carrier handover timestamps but no delivery timestamps, indicating delivery issues during this period.
- About 80% of canceled orders have payment approval timestamps, but this proportion increased significantly starting January 2018, approaching 100%.

**Number of Last Stages**

Let’s look at the last stage that orders with the ‘canceled’ status reached, over time.

For this:
- transform the wide table into a long one, making the name of the time variable a category;
- remove missing values in the time (this will be the variable with the value after melt);
- convert these categories into a categorical type in pandas and specify the order;
- group by order;
- take the first time in each group (all entries in the group will have the same time);
- take the maximum stage (since we specified the order, this will be the last stage of the order).

In [None]:
# Take an independent copy of the canceled orders: assigning a new column to a
# filtered slice of df_orders would otherwise trigger SettingWithCopyWarning.
tmp_df_orders_canceled = df_orders[lambda x: x.order_status == 'Canceled'].copy()
# Keep the purchase timestamp under a separate name so it survives the
# rename/melt of the original timestamp columns.
tmp_df_orders_canceled['tmp_date'] = tmp_df_orders_canceled['order_purchase_dt']

In [None]:
# Reshape to long format: one row per (order, stage) for every stage whose
# timestamp is present. The timestamp value itself is only needed to filter out
# stages that were not reached, so it is discarded afterwards.
tmp_df_orders_canceled = (
    tmp_df_orders_canceled
    .rename(columns={
        'order_purchase_dt': 'purchase',
        'order_approved_dt': 'approved',
        'order_delivered_carrier_dt': 'delivered_carrier',
        'order_delivered_customer_dt': 'delivered_customer',
    })
    .melt(
        id_vars=['tmp_date', 'order_id'],
        value_vars=['purchase', 'approved', 'delivered_carrier', 'delivered_customer'],
        var_name='date_stage',
    )
    .loc[lambda long_df: long_df['value'].notna()]
    .drop(columns='value')
)

In [None]:
# Chronological order of the order stages.
date_stage_order = ['purchase', 'approved', 'delivered_carrier', 'delivered_customer']
# Turn the stage name into an ordered categorical so that stages can be compared
# (max == furthest stage). set_categories is used instead of reorder_categories
# because the latter raises ValueError when a stage from the list does not occur
# in the data at all.
tmp_df_orders_canceled['date_stage'] = (
    tmp_df_orders_canceled['date_stage']
    .astype('category')
    .cat.set_categories(date_stage_order, ordered=True)
)

In [None]:
# Collapse the long table back to one row per order.
tmp_df_orders_canceled = (
    tmp_df_orders_canceled.groupby('order_id', as_index=False)
    .agg(
        # tmp_date was an id column of the melt, so it is the same on every row
        # of an order; taking the first value is enough.
        tmp_date = ('tmp_date', 'first')
        # date_stage is an ordered categorical, so 'max' is the furthest stage
        # for which the order has a timestamp.
        , last_date_stage = ('date_stage', 'max')
    )
)

Let’s look at it over time.

In [None]:
# Display titles keyed by the column names passed to viz.line below. The x axis
# uses the 'tmp_date' column, so the label must be registered under that name
# (the previous key 'date' matched no plotted column).
labels = dict(
    tmp_date = 'Date',
    order_id = 'Number of Orders',
    last_date_stage = 'Last Stage'
)
# Monthly number of unique canceled orders, split by the last stage reached.
tmp_df_orders_canceled.viz.line(
    x='tmp_date'
    , y='order_id'
    , color='last_date_stage'
    , agg_func='nunique'
    , freq='ME'
    , labels=labels
    , markers=True
    , title='Number of Orders by Month and Last Stage'
)

**Key Observations:**  

- In most months, the process stops after payment approval.
- From December 2017 to March 2018, there was a spike in orders that stopped after carrier handover.
- In August 2018, there was a sharp peak in orders that stopped immediately after purchase.

Let’s look by the customer’s state.

In [None]:
tmp_anomal = df_orders[lambda x: x.order_status == 'Canceled']

In [None]:
# Break the canceled orders down by customer state.
df_orders['order_status'].explore.anomalies_by_categories(
    custom_mask=df_orders['order_status'].eq('Canceled'),
    pct_diff_threshold=0,
    include_columns='tmp_customer_state',
)

**Key Observations:**  

- Among orders with "canceled" status, the share of customers from São Paulo is significantly higher than in the full dataset.

Let’s look at it broken down by product category.

In [None]:
# Break the canceled orders down by product category.
df_orders['order_status'].explore.anomalies_by_categories(
    custom_mask=df_orders['order_status'].eq('Canceled'),
    pct_diff_threshold=1,
    include_columns='tmp_product_categories',
)

**Key Observations:**  

- Missing product categories have a much higher proportion difference, possibly due to items being out of stock.

Let’s look at it broken down by payment type.

In [None]:
# Break the canceled orders down by payment type.
df_orders['order_status'].explore.anomalies_by_categories(
    custom_mask=df_orders['order_status'].eq('Canceled'),
    pct_diff_threshold=0,
    include_columns='tmp_payment_types',
)

**Key Observations:**  

- The "voucher" payment type has a noticeably higher proportion difference.

Let’s look at it broken down by the average order rating.

In [None]:
# Break the canceled orders down by average review score.
df_orders['order_status'].explore.anomalies_by_categories(
    custom_mask=df_orders['order_status'].eq('Canceled'),
    pct_diff_threshold=0,
    include_columns='tmp_avg_reviews_score',
)

**Key Observations:**  

- 69% of orders with "canceled" status have a rating of 1.
- 7% of orders have a rating of 2.
- Customers are clearly dissatisfied.

Let’s randomly sample 20 review comments.  
We’ll repeat this several times.

In [None]:
tmp_anomal = tmp_anomal.merge(df_reviews, on='order_id', how='left')

In [None]:
# Show up to 20 random non-empty review comments. The sample size is capped at
# the number of available comments, because Series.sample raises ValueError
# when asked for more rows than exist. No random_state is passed, so each run
# of the cell draws a different sample.
messages = (
    tmp_anomal['review_comment_message']
    .dropna()
    .pipe(lambda comments: comments.sample(min(20, len(comments))))
    .tolist()
)
display(messages)

**Key Observations:**  

- Review messages indicate orders were not delivered.
- Some reviews mention items being out of stock.

Let’s examine a word cloud from review messages.

In [None]:
tmp_anomal.viz.wordcloud('review_comment_message')

**Key Observations:**  

- Many words relate to delivery.

Let’s analyze the sentiment of the text.

In [None]:
tmp_anomal.analysis.sentiment('review_comment_message')

**Key Observations:**  

- Negative messages significantly outnumber positive ones, and the boxplot lies below 0.

---

**shipped**

Let’s look at the number of orders with the status ‘shipped’ by month.

In [None]:
df_orders['order_status'].explore.anomalies_over_time(
    time_column='order_purchase_dt'
    , custom_mask=df_orders.order_status == 'Shipped'
    , freq='ME'
)

Let’s look at the number of non-missing values in each timestamp column.

In [None]:
tmp_anomal = df_orders[lambda x: x.order_status == 'Shipped']

In [None]:
(
    tmp_anomal[['order_purchase_dt', 'order_approved_dt', 'order_delivered_carrier_dt', 'order_delivered_customer_dt', 'order_estimated_delivery_dt']]
    .count()    
    .to_frame('count')
)

**Key Observations:**  

- The process stops after carrier handover, before customer delivery.

Let’s look by the customer’s state.

In [None]:
df_orders['order_status'].explore.anomalies_by_categories(
    custom_mask=df_orders.order_status == 'Shipped'
    , pct_diff_threshold=1
    , include_columns='tmp_customer_state'
)

**Key Observations:**  

- Among orders with "shipped" status, the share of customers from Rio de Janeiro is significantly higher than in the full dataset.

Let’s look at it broken down by the average order rating.

In [None]:
df_orders['order_status'].explore.anomalies_by_categories(
    custom_mask=df_orders.order_status == 'Shipped'
    , pct_diff_threshold=1
    , include_columns='tmp_avg_reviews_score'
)

**Key Observations:**  

- 62% of orders with "shipped" status have a rating of 1.
- 8% of orders have a rating of 2.
- Customers are clearly dissatisfied.

Let’s randomly sample 20 review comments.  
We’ll repeat this several times.

In [None]:
tmp_anomal = tmp_anomal.merge(df_reviews, on='order_id', how='left')

In [None]:
# Show up to 20 random non-empty review comments. The sample size is capped at
# the number of available comments, because Series.sample raises ValueError
# when asked for more rows than exist. No random_state is passed, so each run
# of the cell draws a different sample.
messages = (
    tmp_anomal['review_comment_message']
    .dropna()
    .pipe(lambda comments: comments.sample(min(20, len(comments))))
    .tolist()
)
display(messages)

**Key Observations:**  

- Review messages indicate most orders were not delivered.

Let’s examine a word cloud from review messages.

In [None]:
tmp_anomal.viz.wordcloud('review_comment_message')

**Key Observations:**  

- Many words relate to delivery.

Let’s analyze the sentiment of the text.

In [None]:
tmp_anomal.analysis.sentiment('review_comment_message')

**Key Observations:**  

- Negative messages outnumber positive ones, and the boxplot mostly lies below 0.

#### 2.3.1.5 Status and Delivery Mismatches

**Delivery status missing but delivery timestamp present**

Let's check if there are orders without "delivered" status that still have a delivery timestamp.

In [None]:
# Orders whose status is not 'Delivered' although a customer delivery timestamp exists.
df_orders[lambda x: x.order_status.ne('Delivered') & x.order_delivered_customer_dt.notna()]

**Key Observations:**  

- There are orders without "delivered" status that have delivery timestamps. Most likely these orders were canceled after delivery.

Let's examine their reviews

In [None]:
# Non-empty review comments of orders whose status is not 'Delivered' although
# a customer delivery timestamp exists.
messages = (
    df_orders[lambda x: x.order_status.ne('Delivered') & x.order_delivered_customer_dt.notna()]
    .merge(df_reviews, on='order_id', how='left')
    .loc[:, 'review_comment_message']
    .dropna()
    .tolist()
)
display(messages)

**Key Observations:**

- Review messages indicate these orders were not delivered.

---

**Status is "delivered" but delivery timestamp is missing**

Let's check if there are orders with "delivered" status but missing delivery timestamps.

In [None]:
# Orders with status 'Delivered' that have no customer delivery timestamp.
df_orders[lambda x: x.order_status.eq('Delivered') & x.order_delivered_customer_dt.isna()]

**Key Observations:**  

- The dataset contains 8 orders with "delivered" status but missing delivery timestamps.

Let's examine their reviews

In [None]:
# Non-empty review comments of orders with status 'Delivered' that have no
# customer delivery timestamp.
messages = (
    df_orders[lambda x: x.order_status.eq('Delivered') & x.order_delivered_customer_dt.isna()]
    .merge(df_reviews, on='order_id', how='left')
    .loc[:, 'review_comment_message']
    .dropna()
    .tolist()
)
display(messages)

**Key Observations:**  

- Review messages suggest the products were actually delivered.

---

**Order canceled or unavailable but has delivery timestamp**

Let's check if there are orders with "canceled" or "unavailable" status that still have delivery timestamps.

In [None]:
# Canceled or unavailable orders that nevertheless have a customer delivery timestamp.
df_orders[lambda x: x.order_status.isin(['Canceled', 'Unavailable']) & x.order_delivered_customer_dt.notna()]

**Key Observations:**  

- The dataset contains 6 orders with "canceled" status that have customer delivery timestamps

Let's examine their reviews

In [None]:
# Non-empty review comments of canceled or unavailable orders that nevertheless
# have a customer delivery timestamp.
messages = (
    df_orders[lambda x: x.order_status.isin(['Canceled', 'Unavailable']) & x.order_delivered_customer_dt.notna()]
    .merge(df_reviews, on='order_id', how='left')
    .loc[:, 'review_comment_message']
    .dropna()
    .tolist()
)
display(messages)

**Key Observations:**  

- Review messages indicate some items were delivered while others were not.

#### 2.3.1.6 Date Inconsistencies

**order_purchase_dt**

Let's check if there are timestamps earlier than purchase dates.

In [None]:
# For each later-stage timestamp, count the orders where both it and the
# purchase timestamp are present and the purchase is NOT earlier-or-equal,
# i.e. the stage is recorded before the purchase. Only non-zero counts are printed.
for col_dt in ['order_approved_dt', 'order_delivered_carrier_dt', 'order_delivered_customer_dt']:
    rows_cnt = int(
        (
            df_orders['order_purchase_dt'].notna()
            & df_orders[col_dt].notna()
            & ~(df_orders['order_purchase_dt'] <= df_orders[col_dt])
        ).sum()
    )
    if rows_cnt:
        print(f'{col_dt} < order_purchase_dt, rows count: {rows_cnt}')

**Key Observations:**  

- There are 166 orders where carrier handover time is earlier than purchase time. This is unusual.

Let's examine the dataframe

In [None]:
# Flag orders whose carrier handover is timestamped before the purchase.
# A comparison involving NaT is False, so orders lacking either timestamp are excluded automatically.
tmp_mask = df_orders['order_delivered_carrier_dt'] < df_orders['order_purchase_dt']
tmp_df_orders = df_orders.loc[tmp_mask]
print(f'rows: {len(tmp_df_orders)}')
display(tmp_df_orders.head(5))

Let's analyze by day

In [None]:
# Presumably plots the flagged rows per day of order_purchase_dt (freq='D').
# NOTE(review): tmp_mask is indexed like df_orders, yet the accessor is invoked on the
# already-filtered tmp_df_orders, whereas the anomalies_by_categories calls that follow
# pass the same mask to df_orders.explore — confirm how `explore` aligns custom_mask here.
tmp_df_orders.explore.anomalies_over_time(
    time_column='order_purchase_dt'
    , custom_mask=tmp_mask
    , freq='D'
)

**Key Observations:**
- These anomalies only occurred between 25 April and 24 August 2018.

Let's analyze by order status

In [None]:
df_orders.explore.anomalies_by_categories(
    custom_mask=tmp_mask
    , pct_diff_threshold=-100
    , include_columns='order_status'
)

Let’s look at it broken down by payment type.

In [None]:
df_orders.explore.anomalies_by_categories(
    custom_mask=tmp_mask
    , pct_diff_threshold=-100
    , include_columns='tmp_payment_types'
)

**Key Observations:**
- Over 90% of anomalous orders were paid by credit card.

Let's analyze by time of day

In [None]:
df_orders.explore.anomalies_by_categories(
    custom_mask=tmp_mask
    , pct_diff_threshold=-100
    , include_columns='tmp_purchase_time_of_day'
)

**Key Observations:**
- Most anomalies occurred in the afternoon.

Let's examine their reviews

In [None]:
# Show up to 20 review comments left for the anomalous orders.
messages = (
    tmp_df_orders.merge(df_reviews, on='order_id', how='left')
    ['review_comment_message']
    .dropna()
    # Cap the sample size at the number of available comments: Series.sample raises
    # ValueError when asked for more rows than exist (sampling without replacement).
    # The fixed seed keeps the displayed sample stable between notebook re-runs.
    .pipe(lambda s: s.sample(n=min(20, len(s)), random_state=42))
    .tolist()
)
display(messages)

**Key Observations:**  

- Nothing unusual found.

---

**order_approved_dt**

Let's check if there are timestamps that should occur after approval but appear earlier.

In [None]:
# For each post-approval timestamp, count the orders where it precedes the payment approval.
# A comparison involving NaT evaluates to False, so orders missing either timestamp are never counted.
for col_dt in ['order_delivered_carrier_dt', 'order_delivered_customer_dt']:
    rows_cnt = int((df_orders[col_dt] < df_orders['order_approved_dt']).sum())
    if rows_cnt:
        print(f'{col_dt} < order_approved_dt, rows count: {rows_cnt}')

**Key Observations:**  

- There are 1,359 orders where carrier handover time is earlier than payment approval time.
- There are 61 orders where delivery time is earlier than payment approval time.

Let’s examine each one separately.

**order_delivered_carrier_dt < order_approved_dt**

Let's examine the dataframe

In [None]:
# Flag orders whose carrier handover is timestamped before the payment approval.
# A comparison involving NaT is False, so orders lacking either timestamp are excluded automatically.
tmp_mask = df_orders['order_delivered_carrier_dt'] < df_orders['order_approved_dt']
tmp_df_orders = df_orders.loc[tmp_mask]
print(f'rows: {len(tmp_df_orders)}')
display(tmp_df_orders.head(5))

Let's examine by days

In [None]:
# Presumably plots the flagged rows per day of order_purchase_dt (freq='D').
# NOTE(review): tmp_mask is indexed like df_orders, yet the accessor is invoked on the
# already-filtered tmp_df_orders, whereas the anomalies_by_categories calls that follow
# pass the same mask to df_orders.explore — confirm how `explore` aligns custom_mask here.
tmp_df_orders.explore.anomalies_over_time(
    time_column='order_purchase_dt'
    , custom_mask=tmp_mask
    , freq='D'
)

**Key Observations:**
- Days with most anomalies:
    - 19-23 April 2018
    - 3-4 July 2018
- Possible system issues caused delayed payment approvals.

Let's analyze by order status

In [None]:
df_orders.explore.anomalies_by_categories(
    custom_mask=tmp_mask
    , pct_diff_threshold=-100
    , include_columns='order_status'
)

**Key Observations:**
- Nearly all orders were eventually delivered.

Let's analyze by time of day

In [None]:
df_orders.explore.anomalies_by_categories(
    custom_mask=tmp_mask
    , pct_diff_threshold=-100
    , include_columns='tmp_purchase_time_of_day'
)

**Key Observations:**
- More anomalies occurred in the afternoon.

**order_delivered_customer_dt < order_approved_dt**

In [None]:
# Flag orders whose customer delivery is timestamped before the payment approval.
# A comparison involving NaT is False, so orders lacking either timestamp are excluded automatically.
tmp_mask = df_orders['order_delivered_customer_dt'] < df_orders['order_approved_dt']
tmp_df_orders = df_orders.loc[tmp_mask]

Let's examine by days

In [None]:
# Presumably plots the flagged rows per day of order_purchase_dt (freq='D').
# NOTE(review): tmp_mask is indexed like df_orders, yet the accessor is invoked on the
# already-filtered tmp_df_orders, whereas the anomalies_by_categories calls that follow
# pass the same mask to df_orders.explore — confirm how `explore` aligns custom_mask here.
tmp_df_orders.explore.anomalies_over_time(
    time_column='order_purchase_dt'
    , custom_mask=tmp_mask
    , freq='D'
)

**Key Observations:**
- Anomalies occurred sporadically on specific dates.

Let’s look at it broken down by payment type.

In [None]:
df_orders.explore.anomalies_by_categories(
    custom_mask=tmp_mask
    , pct_diff_threshold=-100
    , include_columns='tmp_payment_types'
)

**Key Observations:**
- "Boleto" payments had significantly more anomalies.

Let's analyze by customer state

In [None]:
df_orders.explore.anomalies_by_categories(
    custom_mask=tmp_mask
    , pct_diff_threshold=-100
    , include_columns='tmp_customer_state'
)

**Key Observations:**
- Most anomalies occurred in São Paulo.

**order_delivered_carrier_dt**

Let's check if there are timestamps that should occur after carrier handover but appear earlier.

In [None]:
# Flag orders delivered to the customer before they were handed to the carrier.
# A comparison involving NaT is False, so orders lacking either timestamp are excluded automatically.
tmp_mask = df_orders['order_delivered_customer_dt'] < df_orders['order_delivered_carrier_dt']
rows_cnt = int(tmp_mask.sum())
if rows_cnt:
    print(f'order_delivered_customer_dt < order_delivered_carrier_dt, rows count: {rows_cnt}')

**Key Observations:**  

- There are 23 orders where delivery time is earlier than carrier handover time.

**review_creation_dt < order_purchase_dt**

We have order creation time and review creation time. Let's check if any reviews were created before their corresponding orders.

In [None]:
# Pair every order with its review(s) and keep the rows whose review was created
# before the day on which the order was purchased.
# The purchase timestamp is truncated to midnight with .dt.normalize() so both sides
# of the comparison stay datetime64: .dt.date yields datetime.date objects, and
# ordering comparisons between those and datetime64 values are not supported by
# pandas 2.x.
temp_df = df_orders.merge(df_reviews, on='order_id', how='left')
temp_df = temp_df[lambda x: x.order_purchase_dt.dt.normalize() > x.review_creation_dt]
temp_df.shape[0]

The dataset contains 65 orders where reviews were created before the orders themselves.

Let's examine them

In [None]:
temp_df.head()

Let’s look at how many of these orders do not have a payment approval date.

In [None]:
temp_df.order_approved_dt.isna().sum()

Let’s look at how many orders do not have a delivery date.

In [None]:
# Count the orders without a customer delivery timestamp
# (the approval column is already counted by the preceding cell).
temp_df.order_delivered_customer_dt.isna().sum()

Let’s look at how many of them were canceled.

In [None]:
temp_df.order_status.value_counts() 

**Key Observations:**  

- The dataset contains 65 orders where reviews were created before the orders themselves. 58 orders were canceled. 6 were delivered. 1 was in delivery process.

Let's examine the 6 delivered orders

In [None]:
temp_df[temp_df.order_status=='Delivered']

We previously determined that one order can have multiple reviews and one review can cover multiple orders.

Let's check for duplicates in these orders and reviews.

In [None]:
# Distinct order ids and review ids present in temp_df.
# NOTE(review): "unque" is a typo for "unique"; the names are left unchanged because
# temp_unque_reviews is referenced by another cell.
temp_unque_orders = temp_df.order_id.unique()
temp_unque_reviews = temp_df.review_id.unique()

In [None]:
# All review rows for the affected review ids, joined to their orders and sorted so
# that rows sharing a review id sit next to each other.
(
    df_reviews
    .loc[lambda r: r['review_id'].isin(temp_unque_reviews)]
    .merge(df_orders, how='left', on='order_id')
    .sort_values(by='review_id')
    .head()
)

Even accounting for duplicates, both orders show review creation dates preceding order dates.

Let's check if any review responses were created before the reviews themselves:

In [None]:
# Reviews whose answer is timestamped no later than the review itself.
df_reviews.loc[lambda r: r['review_answer_dt'] <= r['review_creation_dt']]

No such cases found.

### 2.3.2 Table df_payments

Let’s look at the information about the dataframe.

In [None]:
df_payments.explore.info()

#### 2.3.2.1 Initial Column Analysis

We will examine each column individually.

**order_id**

In [None]:
df_payments['order_id'].explore.info(plot=False)

**Key Observations:**  

- All is well.


**payment_sequential**

In [None]:
df_payments['payment_sequential'].explore.info(plot=False)

**Key Observations:**  

- The maximum number of payment methods for a single order is 29.

**payment_type**

In [None]:
df_payments['payment_type'].explore.info(plot=True)

**Key Observations:**  

- 74% of payments were made using credit cards.
- The payment_type field contains undefined payment types (<1%).

**payment_installments**

In [None]:
df_payments['payment_installments'].explore.info()

**Key Observations:**  

- The maximum number of installments for a product payment is 24.
- The median number of payment installments is 1.
- 75% of orders have installment plans with 4 or fewer payments.
- There are 2 orders with a value of 0 in payment_installments.

**payment_value**

In [None]:
df_payments['payment_value'].explore.info()

**Key Observations:**  

- There are 9 zero-value payments in payment_value.
- The maximum payment is 13.66k. The median payment is 100.
- The 13.66k payment is clearly an outlier.

#### 2.3.2.2 Exploring Outliers

In [None]:
df_payments.explore.anomalies_report(
    anomaly_type='outlier'
)

Let's examine payments exceeding 5,000.

In [None]:
df_payments[df_payments.payment_value > 5_000]

Let's check for outliers in total order amounts per user.

In [None]:
# Ten customers with the largest total amount paid across all of their orders.
(
    df_customers
    .merge(df_orders, how='left', on='customer_id')
    .merge(df_payments, how='left', on='order_id')
    .groupby('customer_unique_id')['payment_value']
    .agg('sum')
    .sort_values(ascending=False)
    .head(10)
    .to_frame()
)


**Key Observations:**  

- One user made orders totaling 13,664. This clearly stands out from the rest.
- There are also several users who made purchases totaling 6,000 or more.

Let's identify outliers using quantiles.

We'll consider values outside the 5th and 95th percentiles as outliers.

In [None]:
df_payments.explore.detect_anomalies(
    anomaly_type='outlier'
    , method='quantile'
    , threshold=0.05
)

**Key Observations:**  

- 10% of payment values are outliers. This exceeds the typical norm (5%) but isn't critical.
- For payment installments, outliers account for less than 1%, which is normal.

**payment_value**

Let's examine the distribution of payment value outliers over time.

In [None]:
tmp_outl = df_payments.merge(df_orders, on='order_id', how='left')

In [None]:
tmp_outl['payment_value'].explore.anomalies_over_time(
    time_column='order_purchase_dt'
    , anomaly_type='outlier'
    , freq='D'
)

**Key Observations:**  

- Many payment outliers occurred between November 20-26, 2017, likely related to Black Friday.

In [None]:
del tmp_outl

#### 2.3.2.3 Exploring Other Anomalies

Let's explore zero values.

In [None]:
df_payments.explore.anomalies_report(
    anomaly_type='zero'
    , sample_size=20
)

**Key Observations:**  

- Orders with zero payment amounts have either "voucher" or "not_defined" as their payment type.

Let's examine zeros in each column separately.

**Zeros in payment_installments**

In [None]:
df_payments[df_payments.payment_installments == 0]

Since payment_sequential shows 2, there should have been another payment. Let's examine these orders.

In [None]:
df_payments[df_payments.order_id == '744bade1fcf9ff3f31d860ace076d422']

In [None]:
df_payments[df_payments.order_id == '1a57108394169c0b47d8f876acc9ba2d']

Let's check these orders in df_items.

In [None]:
df_items[df_items.order_id == '744bade1fcf9ff3f31d860ace076d422']

In [None]:
df_items[df_items.order_id == '1a57108394169c0b47d8f876acc9ba2d']

As we can see, the order wasn't fully recorded in df_payments. The first payment is missing.

---

**Zeros in payment_value**

In [None]:
df_payments[df_payments.payment_value == 0]

Let's look at other payments for order 8bcbe01d44d147f901cd3192671144db.

In [None]:
df_payments[df_payments.order_id == '8bcbe01d44d147f901cd3192671144db']

**Key Observations:**  

- One payment was processed as zero, and it was the last payment.
- Moreover, all zero payments have either "voucher" or "not_defined" as their type.
- There might be some specific payment logic here.
- It's better not to modify these zeros.

### 2.3.3 Table df_items

Let’s look at the information about the dataframe.

In [None]:
df_items.explore.info()

#### 2.3.3.1 Initial Column Analysis

We will examine each column individually.

**order_id**

In [None]:
df_items['order_id'].explore.info(plot=False)

**order_item_id**

In [None]:
df_items['order_item_id'].explore.info()

**Key Observations:**  

- The maximum quantity of items in a single order is 21.


**product_id**

In [None]:
df_items['product_id'].explore.info(plot=False)

**seller_id**

In [None]:
df_items['seller_id'].explore.info(plot=False)

**shipping_limit_dt**

In [None]:
df_items['shipping_limit_dt'].explore.info()

**Key Observations:**  

- In shipping_limit_dt: 20% missing years, 41% missing months, 47% missing weeks, 57% missing days.
- The maximum date in shipping_limit_dt is 2020-04-09.


**price**

In [None]:
df_items['price'].explore.info()

**Key Observations:**  

- Most products are priced between 39.9 and 134.9.
- The median product price is 74.99.

**freight_value**

In [None]:
df_items['freight_value'].explore.info()

**Key Observations:**  

- There are zero values in freight_value.

#### 2.3.3.2 Exploring Outliers

In [None]:
df_items.explore.anomalies_report(
    anomaly_type='outlier'
    , exclude_columns='seller_id'
)

**Key Observations:**  

- About 10% outliers exist in product prices and shipping costs. This exceeds the typical norm (usually 5%) but isn't critical.

#### 2.3.3.3 Product Sales Inconsistencies

Checking if any products in the items table have multiple sellers:

In [None]:
df_items.groupby('product_id')['seller_id'].nunique().sort_values(ascending=False).head(10).to_frame('sellers_cnt')

**Key Observations:**  

- Some product IDs were sold by different sellers.

In [None]:
df_items.groupby('product_id')['seller_id'].nunique().value_counts().to_frame('products_cnt')

**Key Observations:**  

- Over 1,000 products have more than 2 sellers.

Examining product d285360f29ac7fd97640bf0baef03de0

In [None]:
df_products[lambda x: x.product_id == 'd285360f29ac7fd97640bf0baef03de0']

In [None]:
# Sales rows of the inspected product, enriched with the seller's attributes.
tmp_df_res = (
    df_items
    .loc[
        lambda x: x['product_id'] == 'd285360f29ac7fd97640bf0baef03de0'
        , ['shipping_limit_dt', 'price', 'freight_value', 'seller_id']
    ]
    .merge(df_sellers, how='left', on='seller_id')
)
# Distinct sellers that sold this product.
tmp_df_res['seller_id'].unique()

In [None]:
tmp_df_res.seller_state.unique()

In [None]:
tmp_df_res.seller_city.unique()

**Key Observations:**  

- Sellers are located in different cities.
- This might not be an anomaly - different sellers could legitimately sell identical products with matching IDs.

Checking if any products were sold across different seller states

In [None]:
# Ten products sold from the largest number of distinct seller states.
(
    df_items
    .merge(df_sellers, how='left', on='seller_id')
    .groupby('product_id')['seller_state']
    .nunique()
    .sort_values(ascending=False)
    .head(10)
    .to_frame(name='states_cnt')
)

**Key Observations:**  

- Some products were sold by sellers in different states.

Verifying city consistency for customer_id in the customers table (as this is our join key)

In [None]:
(df_customers.groupby('customer_id')[['customer_state', 'customer_city']].nunique() > 1).sum()

All is well.

#### 2.3.3.4 Date Inconsistencies

**shipping_limit_dt**

Analyzing anomalous shipping_limit_dt values

In [None]:
df_items[df_items.shipping_limit_dt > '2018-12-31'].merge(df_orders, on='order_id', how='left')

**Key Observations:**  

- Found 4 orders with abnormally large shipping_limit_dt values, despite having normal estimated delivery times.

#### 2.3.3.5 Exploring Other Anomalies

Examining zero values

In [None]:
tmp_zeros = df_items.explore.detect_anomalies(
    anomaly_type='zero'
    , return_mode='by_column'
)['freight_value']

**Key Observations:**  

- Zero freight values may indicate free shipping.

Examining rows

In [None]:
tmp_zeros.sample(5)

Reviewing zero-value over time

In [None]:
df_items.freight_value.explore.anomalies_over_time(
    time_column='shipping_limit_dt'
    , anomaly_type='zero'
    , freq='W'
)

**Key Observations:**  

- Most zero shipping costs occurred between April-July 2018.

### 2.3.4 Table df_customers

Let’s look at the information about the dataframe.

In [None]:
df_customers.explore.info()

#### 2.3.4.1 Initial Column Analysis

We will examine each column individually.

**customer_id**

In [None]:
df_customers['customer_id'].explore.info(plot=False)

**customer_unique_id**

In [None]:
df_customers['customer_unique_id'].explore.info(plot=False)

**Key Observations:**  

- customer_unique_id has 3% duplicates - acceptable as this field doesn't require uniqueness in this table.


**customer_zip_code_prefix**

In [None]:
df_customers['customer_zip_code_prefix'].explore.info(plot=False)

**customer_city**

In [None]:
df_customers['customer_city'].explore.info()

**Key Observations:**  

- Most customers are from São Paulo city (16%).


**customer_state**

In [None]:
df_customers['customer_state'].explore.info()

**Key Observations:**  

- Most customers are from SP state (42%).


### 2.3.5 Table df_reviews

Let’s look at the information about the dataframe.

In [None]:
df_reviews.explore.info()

#### 2.3.5.1 Initial Column Analysis

We will examine each column individually.

**review_id**

In [None]:
df_reviews['review_id'].explore.info(plot=False)

**Key Observations:**  

- review_id contains 827 duplicates.


**order_id**

In [None]:
df_reviews['order_id'].explore.info(plot=False)

**Key Observations:**  

- order_reviews table has 559 duplicate order_ids.


**review_score**

In [None]:
df_reviews['review_score'].explore.info(column_type='categorical')

**Key Observations:**  

- Over half of reviews (57%) give maximum 5-star ratings.

**review_comment_title**

In [None]:
df_reviews['review_comment_title'].explore.info(column_type='text')

**Key Observations:**  

- 88% of review titles are missing.
- Most common review title (8%) is 'recomendo'.

**review_comment_message**

In [None]:
df_reviews['review_comment_message'].explore.info() 

**Key Observations:**  

- 58% of orders lack review messages.
- Only 36% of review comments are unique.
- Most frequent comment (1%) contains "muito bom".

**review_creation_dt**

In [None]:
df_reviews['review_creation_dt'].explore.info()

**Key Observations:**  

- review_creation_dt has 9% missing days.

**review_answer_dt**

In [None]:
df_reviews['review_answer_dt'].explore.info()

**Key Observations:**  

- review_answer_dt has 5% missing days.

#### 2.3.5.2 Exploring Missing Values

Checking columns with missing values:

In [None]:
df_reviews.explore.anomalies_report(
    anomaly_type='missing'
    , pct_diff_threshold=10
    , show_by_categories=False
    , show_sample=False
    , width=600
)

**Key Observations:**  

- Missing review titles/messages aren't anomalies - they were simply not provided.

#### 2.3.5.3 Exploring Duplicates

Examining duplicates in order_id and review_id:

In [None]:
df_reviews[['order_id', 'review_id']].duplicated().sum()

No instances where both order_id and review_id are duplicated simultaneously.

Theoretical possibility: one order could have multiple reviews, but multiple orders sharing one review is unusual.

Analyzing order_id and review_id duplicates separately

**review_id**

In [None]:
# Keep EVERY row of a repeated review_id. The default keep='first' would drop the
# first occurrence of each id, so the per-review_id comparisons made later
# (distinct customers, scores, products, ...) would only see part of each group
# and could not detect differences between the occurrences.
tmp_dupl = df_reviews[df_reviews.review_id.duplicated(keep=False)]

Reviewing review_id duplicate distribution over time

In [None]:
df_reviews.review_id.explore.anomalies_over_time(
    time_column='review_creation_dt'
    , anomaly_type='duplicate'
    , freq='W'
)

**Key Observations:**  

- March 2018 saw a significant spike in duplicate review_ids (one review applied to multiple orders).

Checking for duplicates with different customers

In [None]:
# Attach order and customer attributes to the duplicated reviews, then count how many
# distinct customers stand behind each review id.
tmp_dupl = (
    tmp_dupl
    .merge(df_orders, how='left', on='order_id')
    .merge(df_customers, how='left', on='customer_id')
)
(
    tmp_dupl
    .groupby('review_id')['customer_unique_id']
    .nunique()
    .value_counts()
)

**Key Observations:**  

- No duplicates with different customers.

Examining product quantities in these orders

In [None]:
(
    tmp_dupl.merge(df_items, on='order_id', how='left')
    .groupby('order_id')['product_id']
    .nunique()
    .value_counts()
)

**Key Observations:**  

- Most orders contain one product.
- Some orders show no products (due to missing records in the items table).

In [None]:
tmp_dupl = (tmp_dupl.merge(df_payments, on='order_id', how='left')
            .merge(df_items, on='order_id', how='left')
)

In [None]:
tmp_dupl = tmp_dupl[['review_id', 'order_id', 'review_score', 'review_comment_title', 'review_comment_message'
                   , 'review_creation_dt', 'order_delivered_customer_dt', 'order_status', 'payment_type'
                   , 'payment_value', 'product_id', 'price', 'freight_value']].sort_values('review_id').drop_duplicates()
tmp_dupl.head()

Comparing duplicate values across columns (replacing missing values with __na__):

In [None]:
(tmp_dupl.fillna({'review_comment_message': '__na__'})
 .groupby('review_id')
 [['review_comment_message', 'review_score', 'order_status', 'payment_type', 'payment_value', 'product_id', 'price']]
 .nunique()
 .apply(pd.Series.value_counts)
)

**Key Observations:**  

- Identical reviews were left for different orders with matching ratings and descriptions, but varying products/prices.
- This is unusual - could indicate bulk reviews for multiple orders or data collection errors.

---

**order_id**

Analyzing order_id duplicate distribution over time

In [None]:
# Keep EVERY review of an order that was reviewed more than once. The default
# keep='first' would drop the first review of each order, so the later per-order
# comparisons (including "first rating vs. average rating") would miss it.
tmp_dupl = df_reviews[df_reviews.order_id.duplicated(keep=False)]

In [None]:
df_reviews.order_id.explore.anomalies_over_time(
    time_column='review_creation_dt'
    , anomaly_type='duplicate'
    , freq='W'
)

**Key Observations:**  

- March 2018 showed a spike in order_id duplicates (multiple reviews for single orders).

In [None]:
tmp_dupl = (tmp_dupl.merge(df_orders, on='order_id', how='left')
            .merge(df_customers, on='customer_id', how='left')
)

In [None]:
tmp_dupl = tmp_dupl[['order_id', 'review_id', 'review_score', 'review_comment_title', 'review_comment_message'
                   , 'review_creation_dt', 'order_delivered_customer_dt', 'order_status']].sort_values('order_id').drop_duplicates()
tmp_dupl.head(10)

**Key Observations:**  

- Customers sometimes left multiple reviews per order (e.g., one pre-delivery and one post-delivery) - not necessarily anomalous.

Let's examine how many duplicates share identical values across different columns.

For description fields, we'll replace missing values with __na__.

In [None]:
(tmp_dupl.fillna({'review_comment_message': '__na__'})
 .groupby('order_id')
 [['review_comment_message', 'review_score', 'order_status']]
 .nunique()
 .apply(pd.Series.value_counts)
)

Secondary Review Rating Comparison

We'll analyze whether follow-up reviews for the same order had higher or lower ratings:

- Compare each order's first rating with the average rating across its reviews.
- If the first rating is below the average, at least one later review rated the order higher.

In [None]:
# Per order: is the chronologically first rating lower than the order's mean rating?
(
    tmp_dupl
    .sort_values(by=['order_id', 'review_creation_dt'])
    .groupby('order_id')['review_score']
    .agg(first_review_score='first', mean_review_score='mean')
    .assign(
        is_first_less_mean=lambda scores: scores['first_review_score'] < scores['mean_review_score']
    )
    .loc[:, 'is_first_less_mean']
    .value_counts()
)

**Key Observations:**  

- Subsequent reviews for the same order typically received lower ratings than the initial review.

In [None]:
del tmp_dupl

### 2.3.6 Table df_products

Let’s look at the information about the dataframe.

In [None]:
df_products.explore.info()

#### 2.3.6.1 Initial Column Analysis

We will examine each column individually.

**product_id**

In [None]:
df_products['product_id'].explore.info(plot=False)

**product_category_name**

In [None]:
df_products['product_category_name'].explore.info()

**Key Observations:**  

- product_category_name contains 2% missing values
- Dataset contains 73 unique product categories

**product_name_lenght**

In [None]:
df_products['product_name_lenght'].explore.info()

**Key Observations:**  

- product_name_lenght has 2% missing values
- Maximum product name length: 76 characters

**product_description_lenght**

In [None]:
df_products['product_description_lenght'].explore.info()

**Key Observations:**  

- There are 2% missing values in product_description_lenght.
- The maximum length of the product description is 3.99k characters.
- The minimum length of the product description is 4 characters.

**product_photos_qty**

In [None]:
df_products['product_photos_qty'].explore.info(column_type='categorical')

**Key Observations:**  

- In product_photos_qty, 2% of values are missing.
- The maximum number of photos for a single product is 20.
- 50% of products have 1 photo.

**product_weight_g**

In [None]:
df_products['product_weight_g'].explore.info()

**Key Observations:**  

- In product_weight_g, there are 2 missing values.
- In product_weight_g, there are 4 zero values.
- The maximum product weight is 40.42k grams.
- The product weight of 40.42k grams is clearly an outlier.

**product_length_cm**

In [None]:
df_products['product_length_cm'].explore.info()

**Key Observations:**  

- In product_length_cm, there are 2 missing values.
- The maximum product length is 105 cm. The minimum is 7 cm. The median is 25 cm.

**product_height_cm**

In [None]:
df_products['product_height_cm'].explore.info()

**Key Observations:**  

- In product_height_cm, there are 2 missing values.
- The maximum product height is 105 cm. The minimum is 2 cm. The median is 13 cm.

**product_width_cm**

In [None]:
df_products['product_width_cm'].explore.info()

**Key Observations:**  

- In product_width_cm, there are 2 missing values.
- The maximum product width is 118 cm. The minimum is 6 cm. The median is 20 cm.

#### 2.3.6.2 Exploring Missing Values

Let's see which columns have missing values.

In [None]:
df_products.explore.anomalies_report(
    anomaly_type='missing'
    , pct_diff_threshold=10
)

Let's hypothesize that missing values in the following columns are in the same rows:

- product category name
- product name length
- product description length
- number of product photos

In [None]:
df_products.explore.detect_simultaneous_anomalies(['product_category_name', 'product_name_lenght', 'product_description_lenght', 'product_photos_qty'])

**Key Observations:**  

- Missing values in product category name, product name length, product description length, and number of product photos are in the same rows.

Let's hypothesize that missing values in the following columns are in the same rows:

- product length
- product width
- product height
- product weight

In [None]:
df_products.explore.detect_simultaneous_anomalies(['product_weight_g', 'product_length_cm', 'product_height_cm', 'product_width_cm'])

**Key Observations:**  

- Missing values in product length, width, height, and weight are located in the same rows.

#### 2.3.6.3 Exploring Outliers

In [None]:
df_products.explore.anomalies_report(
    anomaly_type='outlier'
)

**Key Observations:**  

- The proportion of outliers in the number of photos, length, and width of products is within normal limits.
- The proportion of outliers in product weight and height exceeds the norm (usually 5%), but is not critical.

#### 2.3.6.4 Exploring Other Anomalies

Examining zero values

In [None]:
df_products.explore.anomalies_report(
    anomaly_type='zero'
)

**Key Observations:**  

- All 4 products with zero weight belong to the category cama_mesa_banho (home textiles).

### 2.3.7 Table df_categories

Let’s look at the information about the dataframe.

In [None]:
df_categories.explore.info()

#### 2.3.7.1 Initial Column Analysis

We will examine each column individually.

**product_category_name**

In [None]:
df_categories['product_category_name'].explore.info(plot=False)

**Key Observations:**  

- The df_categories table has 71 unique product categories, while the products table has 73 categories.

**product_category_name_english**

In [None]:
df_categories['product_category_name_english'].explore.info(plot=False)

### 2.3.8 Table df_sellers

Let’s look at the information about the dataframe.

In [None]:
df_sellers.explore.info()

#### 2.3.8.1 Initial Column Analysis

We will examine each column individually.

**seller_id**

In [None]:
df_sellers['seller_id'].explore.info(plot=False)

**seller_zip_code_prefix**

In [None]:
df_sellers['seller_zip_code_prefix'].explore.info(plot=False)

**seller_city**

In [None]:
df_sellers['seller_city'].explore.info()

**Key Observations:**  

- Most sellers are from the city of sao paulo (22%).


**seller_state**

In [None]:
df_sellers['seller_state'].explore.info()

**Key Observations:**  

- Most sellers are from the state of SP (60%).


### 2.3.9 Table df_geolocations

Let’s look at the information about the dataframe.

In [None]:
df_geolocations.explore.info()

**Key Observations:**  

- The df_geolocations table has 28% fully duplicated rows.

#### 2.3.9.1 Initial Column Analysis

We will examine each column individually.

**geolocation_zip_code_prefix**

In [None]:
df_geolocations['geolocation_zip_code_prefix'].explore.info(plot=False)

**geolocation_lat**

In [None]:
df_geolocations['geolocation_lat'].explore.info(plot=False)

**geolocation_lng**

In [None]:
df_geolocations['geolocation_lng'].explore.info(plot=False)

**geolocation_city**

In [None]:
df_geolocations['geolocation_city'].explore.info()

**Key Observations:**  

- In geolocation_city, most entries are for the city of sao paulo (16%).


**geolocation_state**

In [None]:
df_geolocations['geolocation_state'].explore.info()

**Key Observations:**  

- In geolocation_state, most entries are for the state of SP (40%).


#### 2.3.9.2 Exploring Duplicates

We have complete row duplicates in the geolocation table. Let's examine them.

Let's check if we have duplicates in the geolocation table in the geolocation_zip_code_prefix field, excluding common duplicates.

In [None]:
tmp_geo = df_geolocations.drop_duplicates()

In [None]:
tmp_geo.explore.detect_anomalies('duplicate', columns=['geolocation_zip_code_prefix'])

**Key Observations:**  

- In the df_geolocations table, there are 97% duplicates in the geolocation_zip_code_prefix column.

Let's see why there are duplicates.

In [None]:
tmp_geo.groupby('geolocation_zip_code_prefix').nunique().head(10)

This makes sense, as geolocation_zip_code_prefix can have many different unique coordinates.

But we need to take this into account when joining tables, since we only have zip_code_prefix in the customer and seller tables.

When joining, we may get many duplicates.

We can average the coordinates, but we can't do the same with cities and states.

Let's check if we have multiple states for a single prefix.

In [None]:
tmp_geo.groupby('geolocation_zip_code_prefix').geolocation_state.nunique().sort_values(ascending=False).head()

Let's see the maximum number of cities with the same prefix.

In [None]:
tmp_geo.groupby('geolocation_zip_code_prefix').geolocation_city.nunique().sort_values(ascending=False).head()

**Key Observations:**  

- In the df_geolocations table, there are prefixes with 2 unique states.
- In the df_geolocations table, there are prefixes with 4 unique cities.

There's nothing we can do about this. We'll need to account for this when analyzing geolocation coordinates.

Since we have states in the customer and seller tables, we can avoid using city and state from the geolocation table.

And we can average the coordinates.

#### 2.3.9.3 Exploring Outliers

Let's see how many sales we have outside South America.

In [None]:
# Copy of the geolocation table with a flag telling whether each point lies inside a
# rough bounding box around South America (both bounds inclusive).
tmp_geo = df_geolocations.assign(
    in_south_america=lambda g: (
        g['geolocation_lat'].between(-53.90, 12.45)       # southern .. northern border
        & g['geolocation_lng'].between(-81.32, -34.79)    # western .. eastern border
    )
)

In [None]:
# Attach one geolocation record per zip-code prefix to each order's customer, keep only
# the fully populated rows, and count orders inside / outside the bounding box.
temp_df = (
    df_orders[['order_id', 'customer_id']]
    .merge(df_customers, how='left', on='customer_id')
    .merge(
        tmp_geo.drop_duplicates(subset=['geolocation_zip_code_prefix'])  # first record per prefix
        , how='left'
        , left_on='customer_zip_code_prefix'
        , right_on='geolocation_zip_code_prefix'
    )
    .dropna()
    .loc[:, ['in_south_america', 'customer_city', 'customer_state', 'geolocation_lat', 'geolocation_lng']]
)
temp_df['in_south_america'].value_counts()

**Key Observations:**  

- There are 6 sales outside South America in the dataset.

Let's look at these orders.

In [None]:
# Show the orders whose matched coordinates lie outside the bounding box.
temp_df.loc[temp_df['in_south_america'].eq(False)]

These coordinates are outside South America.
Either it's an error, or the order was placed outside South America.

- (42.18, -8.72) is off the coast of Spain/Portugal
- (20.09, -30.54) is in the central Atlantic Ocean
- (13.00, -23.58) is in the eastern Atlantic Ocean near Cape Verde
- (-11.31, -34.73) is in the South Atlantic
- (20.09, -30.54) is a repeating point in the central Atlantic Ocean

In [None]:
# Drop the scratch frames used for the geolocation outlier check.
del temp_df, tmp_geo

### 2.3.10 Exploring Cross-Table Anomalies

#### 2.3.10.1 Temporal Boundary Checks

Examining time interval boundaries across different tables.

In [None]:
# Print the [earliest - latest] date span of every datetime column in each DataFrame.
for key, df in dfs:
    for col in df.columns:
        if not pd.api.types.is_datetime64_any_dtype(df[col]):
            continue
        min_date, max_date = df[col].min(), df[col].max()
        print(f"[{min_date.date()} - {max_date.date()}] DataFrame '{key}', Column '{col}':")

**Key Observations:**  

- The latest date in order_approved_dt is earlier than in order_purchase_dt.
- Anomalously large maximum date in shipping_limit_dt compared to other temporal variables.

#### 2.3.10.2 Payment-Order Amount Mismatches

Checking for orders where payment total differs from order value.

In [None]:
# Compare what each order cost (item prices + freight) with what was paid for it.
# An order can be paid in several rows (e.g. voucher + credit card), so payments are
# summed per order first; comparing every single payment row with the full order value
# would flag each split payment as a mismatch.
temp_df = (
    df_items[['order_id', 'price', 'freight_value']]
    .groupby('order_id')
    .sum()
    .assign(total_price=lambda x: x['price'] + x['freight_value'])
    .drop(columns=['price', 'freight_value'])
    .reset_index()
    .merge(
        # min_count=1 keeps an all-missing payment total as NaN instead of 0.
        df_payments.groupby('order_id', as_index=False, observed=True)['payment_value'].sum(min_count=1),
        on='order_id',
        how='inner',
    )
    .merge(df_orders, on='order_id', how='inner')
    .dropna(subset=['payment_value', 'total_price'])
)
# Round both sides to cents so float summation noise does not register as a mismatch.
temp_df['payment_matches_total'] = temp_df['payment_value'].round(2) == temp_df['total_price'].round(2)
temp_df['payment_matches_total'].value_counts()

**Key Observations:**  

- Dataset contains 7,877 orders with payment-amount discrepancies.

In [None]:
# Boolean mask selecting the rows where payment and order value disagree.
tmp_mask = ~temp_df['payment_matches_total']

Let's analyze by payment type.

In [None]:
# Break the mismatching rows down by the values of the tmp_payment_types column.
temp_df.explore.anomalies_by_categories(
    custom_mask=tmp_mask,
    pct_diff_threshold=-100,
    include_columns='tmp_payment_types',
)

**Key Observations:**  

- Payment type analysis shows most mismatches involve voucher payments (likely systemic issue).

### 2.3.11 Exploring Relationships Between Tables

Reviewing inter-table connections for future joins and key consistency.

**df_orders and df_payments**

In [None]:
fron.analyze_join_keys(df_orders, df_payments, "order_id", short_result=False)

**Key Observations:**  

- Orders table contains 1 order_id missing from payments table.

Let's look at what this order is.

In [None]:
# Left-join payments onto orders; rows without a payment_value are orders with no payment record.
temp_df = df_orders.merge(df_payments, how='left', on='order_id')
temp_df.loc[temp_df['payment_value'].isna()]

**df_orders and df_items**

In [None]:
fron.analyze_join_keys(df_orders, df_items, "order_id", short_result=False)

**Key Observations:**  

- Orders table contains 775 order_ids missing from items table.

We have a payments table. Let's check if orders missing from items exist in payments.

In [None]:
# Unique ids of orders that have no row in the order items table.
missing_orders = (
    df_orders
    .merge(df_items, how='left', on='order_id')
    .loc[lambda x: x['order_item_id'].isna(), 'order_id']
    .unique()
)
len(missing_orders)

In [None]:
df_payments[df_payments.order_id.isin(missing_orders)].order_id.nunique()

All these orders are present in the payments table.

Let's check how many of these orders are canceled.

In [None]:
df_orders[df_orders['order_id'].isin(missing_orders)].order_status.value_counts()

**Key Observations:**  

- These orders are either canceled, unavailable, or just created.

Let's examine orders with "shipped" status.

In [None]:
# Orders that have no items but are nevertheless marked as shipped.
df_orders.loc[
    df_orders['order_id'].isin(missing_orders)
    & df_orders['order_status'].eq('Shipped')
]

**df_orders and df_customers**

In [None]:
fron.analyze_join_keys(df_orders, df_customers, "customer_id", short_result=False)

**Key Observations:**  

- All is well.

**df_orders and df_reviews**

In [None]:
fron.analyze_join_keys(df_orders, df_reviews, "order_id", short_result=False)

**Key Observations:**  

- All is well.

**df_items and df_products**

In [None]:
fron.analyze_join_keys(df_items, df_products, "product_id", short_result=False)

**Key Observations:**  

- All is well.

**df_items and df_sellers**

In [None]:
fron.analyze_join_keys(df_items, df_sellers, "seller_id", short_result=False)

**Key Observations:**  

- All is well.

**df_customers and df_geolocations**

In [None]:
fron.analyze_join_keys(df_customers, df_geolocations, left_on = 'customer_zip_code_prefix', right_on = "geolocation_zip_code_prefix", short_result=False)

**Key Observations:**  

- In df_customers table, there are 157 zip_code_prefixes not present in df_geolocations.
- In df_geolocations table, there are 4178 zip_code_prefixes not present in df_customers.

**df_sellers and df_geolocations**

In [None]:
fron.analyze_join_keys(df_sellers, df_geolocations, left_on = 'seller_zip_code_prefix', right_on = "geolocation_zip_code_prefix", short_result=False)

**Key Observations:**  

- In df_sellers table, there are 7 zip_code_prefixes not present in df_geolocations.
- In df_geolocations table, there are 16776 zip_code_prefixes not present in df_sellers.

Delete temporary fields.

In [None]:
# Remove every helper column whose name starts with 'tmp_'; the remaining columns keep their order.
df_orders = df_orders.drop(
    columns=[col for col in df_orders.columns if col.startswith('tmp_')]
)

Clear memory of temporary variables.

In [None]:
# Delete every global whose name starts with 'tmp_'. The names are snapshotted
# first because the globals dict must not change size while being iterated.
for var_name in [name for name in list(globals()) if name.startswith('tmp_')]:
    del globals()[var_name]

<h2 id="2-4"> 2.4 Intermediate Conclusion</h2>

Table Relationships

- Orders table contains 1 order missing from payments table
- Orders table contains 775 orders missing from order items table (mostly canceled/unavailable status)
- Customers table has 157 zip_code_prefixes missing from geolocations
- Geolocations table has 4,178 zip_code_prefixes missing from customers
- Sellers table has 7 zip_code_prefixes missing from geolocations
- Geolocations table has 16,776 zip_code_prefixes missing from sellers

Duplicates

- df_geolocations: 26% full row duplicates
- df_reviews: 827 duplicate review_ids
- df_reviews: 559 duplicate order_ids
- df_geolocations: 97% duplicates in geolocation_zip_code_prefix
- Some zip prefixes map to:
    - 2 unique states
    - 5 unique cities
    
Missing Values

- There are 160 missing values in order_approved_dt (<1% of the total number of rows).
- There are 14 missing values in order_approved_dt for orders with the delivered status.
- All delivered orders with missing values in order_approved_dt have the payment type boleto.
- There are 1.78k missing values in order_delivered_carrier_dt (2% of the total number of rows).
- There are 2 delivered orders with missing values in order_delivered_carrier_dt.
- All orders with the status unavailable have missing values in order_delivered_carrier_dt.
- Both orders with missing values in order_delivered_carrier_dt were paid with a credit card.
- There are 2.96k missing values in order_delivered_customer_dt (3% of the total number of rows).
- There are 2% missing values in product_description_lenght.
- There are 2% missing values in product_photos_qty.
- There are 2 missing values in product_weight_g.
- There are 2 missing values in product_length_cm.
- There are 2% missing values in product_category_name.
- There are 2 missing values in product_height_cm.
- There are 2 missing values in product_width_cm.
- Missing values in product category name, product name length, product description length, and product photos quantity are located in the same rows.
- Missing values in product length, width, height, and weight are located in the same rows.

Zero Values

- There are 383 zero values in freight_value.
- There are 2 orders that have a value of 0 in payment_installments.
- There are 9 zero payments in payment_value.
- There are 4 zero values in product_weight_g.
- There are 797 (95%) zero values in declared_monthly_revenue.
- The main part of zero values in freight_value occurred from April 22 to June 14, 2018. There might have been a free shipping promotion.
- All 4 products with zero weight have the category cama_mesa_banho (textile and home furnishings).
- Orders with zero payment amount have a payment type of either voucher or not_defined.

Other Anomalies

- There are 65 orders for which reviews were created before the orders were created. This is strange. Out of these, 58 orders were canceled, 6 were delivered, and 1 was in the delivery process.
- 6 orders were made outside South America, although the customer's state is in South America.
- The table product_category_name has 71 unique product categories, whereas the table products has 73 categories.
- One user made orders totaling 13,664. This value clearly stands out from the general distribution.
- There are also several users who made purchases totaling 6,000 or more.
- In the column with the total order amount per user, around 10% are outliers. This exceeds the usual norm (usually 5%), but is not a critical value.
- In the product price and freight value, there are slightly less than 10% outliers. This exceeds the usual norm (usually 5%), but is not a critical value.
- In the payment value, there are 10% outliers. This exceeds the usual norm (usually 5%), but is not a critical value.
- In the number of installments, there are less than 1% outliers. This is a normal value.
- On November 24, 2017, there were many outliers in the payment value.
- The proportion of outliers in the number of product photos, length, and width is within the norm.
- The proportion of outliers in weight (9.95%) and height (8.97%) of the product exceeds the norm (usually 5%), but is not critical.
- There is a strange maximum date in shipping_limit_dt. The value is significantly large compared to other time variables.
- Four orders have an abnormally large shipping_limit_dt. However, the estimated delivery time for these orders is normal.
- In the table with orders, there are 1,359 orders where order_delivered_carrier_dt is earlier than order_approved_dt.
- In the table with orders, there are 23 orders where order_delivered_customer_dt is earlier than order_delivered_carrier_dt.
- The dataset contains 8,707 orders where the payment amount does not match the order amount.
- In the table with orders, there are orders with the status canceled, but they also have a delivery time to the customer.
- In the table with order items, there are product IDs that were sold by different sellers. There are more than 1,000 such products. Additionally, there are products sold by sellers located in different states.

Key Observations

- All orders created after August 2018 were canceled, except for one order.
- There were few orders before 2017. Either the data is incomplete or there were few orders.
- 97% of orders have the status "delivered".
- Most customers are from the city of sao paulo (16%).
- Most customers are from the state SP (42%).
- geolocation_city has the most records for the city of sao paulo (14%).
- geolocation_state has the most records for the state SP (40%).
- The maximum number of product units in an order is 21.
- Product prices range from 0.85 to 6.74k.
- Most products are sold for prices ranging from 39.9 to 134.9.
- The median product price is 74.99.
- The maximum number of payment methods for one order is 29.
- The maximum number of installments for one product is 24.
- The median number of installments is 1.
- 75% of orders have installment plans of 4 or fewer parts.
- The maximum payment is 13.66k. The median payment is 100.
- 74% of payments were made using credit cards.
- The payment_type column has undefined payment types (<1%).
- The review_creation_dt column has 9% missing days.
- The review_answer_dt column has 5% missing days.
- More than half of the reviews (57%) have maximum ratings of 5.
- Most reviews (8%) have the title 'recomendo'.
- 58% of orders do not have reviews.
- Only 36% of review comments are unique.
- Most comments in reviews contain the word "muito bom" (1%).
- The maximum length of a product name is 76 characters.
- The maximum length of a product description is 3.99k characters.
- The minimum length of a product description is 4 characters.
- The maximum number of photos for one product is 20.
- 51% of products have 1 photo.
- The maximum product weight is 40.42k grams.
- The maximum product length is 105 cm. The minimum length is 7 cm. The median length is 25 cm.
- The maximum product height is 105 cm. The minimum height is 2 cm. The median height is 13 cm.
- The maximum product width is 118 cm. The minimum width is 6 cm. The median width is 20 cm.
- There are a total of 73 unique product categories.
- Most sellers are from the city of sao paulo (22%).
- Most sellers are from the state SP (60%).

<h1 id="3"> 3 Data Preprocessing</h1>

<h2 id="3-1"> 3.1 Initial Data Filtering</h2>

**Selecting the Time Period for Analysis**

Let's see how many orders there were before 2017.

In [None]:
# Unique approved orders per calendar month, for everything approved before 2017-04-01.
(
    df_orders
    .loc[df_orders['order_approved_dt'] < '2017-04-01']
    .groupby(pd.Grouper(key='order_approved_dt', freq='ME'))
    .agg({'order_id': 'nunique'})
)

**Key Observations:**

- Before 2017, either there were very few orders or the data is incomplete. It would be reasonable to use data starting from 2017.
- If we include data before 2017, the results for this period will not be statistically significant, as only October 2016 had a somewhat sufficient number of orders. This will manifest as anomalous values in average metrics on graphs due to small sample sizes in groups.

Let's see how many orders were approved after August 2018.

In [None]:
# Orders approved from 2018-09-01 00:00:00 onward. The bound is inclusive so it is the exact
# complement of the half-open analysis window [2017-01-01, 2018-09-01) used when trimming.
df_orders[df_orders.order_approved_dt >= '2018-09-01']

**Key Observations:**  

- Only one order. We conclude that data after August 2018 is incomplete.

Let's examine what orders were created after August 2018.

In [None]:
# Orders created from 2018-09-01 00:00:00 onward. The bound is inclusive so it is the exact
# complement of the half-open analysis window [2017-01-01, 2018-09-01) used when trimming.
df_orders[df_orders.order_purchase_dt >= '2018-09-01']

**Key Observations:**  

- All orders created after August 2018 were canceled, except for one. This means we can safely trim the data up to September 2018.

As we determined, there were very few sales before January 2017, or the data is incomplete, so for more accurate analysis, we will consider data from January 2017.

Additionally, there is only one approved order after August 2018. Very few were created, and all were canceled, so we will analyze data up to and including August 2018 to avoid distorting results with incomplete data.

At the same time, it's important not to lose rows with missing values in order_approved_dt.

In [None]:
# Keep orders purchased in [2017-01-01, 2018-09-01) plus rows without a purchase date.
# .copy() detaches the result from the original frame so the column assignments made
# later in the notebook do not raise SettingWithCopyWarning.
df_orders = df_orders[
    df_orders.order_purchase_dt.between(pd.to_datetime('2017-01-01'), pd.to_datetime('2018-09-01'), inclusive='left')
    | df_orders.order_purchase_dt.isna()
].copy()

We’ll do the same for the reviews table. But we’ll only trim the lower date since reviews are created later than orders.

In [None]:
# Keep reviews created on or after 2017-01-01, plus rows without a creation date.
df_reviews = df_reviews[
    df_reviews['review_creation_dt'].ge(pd.Timestamp('2017-01-01'))
    | df_reviews['review_creation_dt'].isna()
]

---

**Filtering by Order Presence**

To ensure data integrity, we keep only those records in related tables that have a corresponding order_id in the orders table.

We’ll keep only users present in the orders table.

In [None]:
fron.analyze_join_keys(df_customers, df_orders, on='customer_id', how='inner')

In [None]:
# Inner join on the key column only: keeps customers that appear in the orders table.
df_customers = pd.merge(df_customers, df_orders[['customer_id']], how='inner', on='customer_id')

In [None]:
fron.analyze_join_keys(df_customers, df_orders, on='customer_id', only_coverage=True)

We’ll keep only payments present in the orders table.

In [None]:
fron.analyze_join_keys(df_payments, df_orders, on='order_id', how='inner')

In [None]:
# Inner join on the key column only: keeps payments whose order is in the orders table.
df_payments = pd.merge(df_payments, df_orders[['order_id']], how='inner', on='order_id')

In [None]:
fron.analyze_join_keys(df_payments, df_orders, on='order_id', only_coverage=True)

We’ll keep only reviews present in the orders table.

In [None]:
fron.analyze_join_keys(df_reviews, df_orders, on='order_id', how='inner')

In [None]:
# Inner join on the key column only: keeps reviews whose order is in the orders table.
df_reviews = pd.merge(df_reviews, df_orders[['order_id']], how='inner', on='order_id')

In [None]:
fron.analyze_join_keys(df_reviews, df_orders, on='order_id', only_coverage=True)

We’ll keep only order items present in the orders table.

In [None]:
fron.analyze_join_keys(df_items, df_orders, on='order_id', how='inner')

In [None]:
# Inner join on the key column only: keeps order items whose order is in the orders table.
df_items = pd.merge(df_items, df_orders[['order_id']], how='inner', on='order_id')

In [None]:
fron.analyze_join_keys(df_items, df_orders, on='order_id', only_coverage=True)

Orders that exist in the orders table but not in the order items table cannot be deleted.

We’ll keep all products to identify which ones were never purchased.

We’ll also keep all sellers to see whose products didn’t sell.

<h2 id="3-2"> 3.2 Outlier Handling</h2>

Let’s examine where we have zero values.


In [None]:
# Report zero-valued cells for every DataFrame held in `dfs`.
# NOTE(review): several frames are re-assigned earlier in the notebook (date trimming,
# inner merges); if `dfs` still holds the pre-filter objects, this report describes the
# unfiltered data — confirm how `dfs` is built.
for key, df in dfs:
    print(f'DataFrame {key}')
    df.explore.detect_anomalies(anomaly_type='zero')

Zeros in delivery cost cannot be processed, as they may indicate free shipping.

Zeros in product weight likely mean the value was not specified for the product.

Since there are few of them and they all belong to the same category (cama_mesa_banho), we’ll replace them with the median value for that category.

In [None]:
# Median weight of the 'cama_mesa_banho' category, computed on real (non-zero) weights only,
# so the placeholder zeros being replaced do not pull the fill value down.
median_for_fill = df_products.loc[
    (df_products['product_category_name'] == 'cama_mesa_banho')
    & (df_products['product_weight_g'] > 0),
    'product_weight_g',
].median()
# Every zero weight is overwritten with that median.
df_products.loc[df_products['product_weight_g'] == 0, 'product_weight_g'] = median_for_fill

Zeros in payment_installments will be replaced with 1, as it is both the mode and median.

In [None]:
# A payment with 0 installments is treated as a single payment.
df_payments['payment_installments'] = df_payments['payment_installments'].mask(
    df_payments['payment_installments'].eq(0), 1
)

Zeros in payment_value cannot be processed, as there may be a payment-specific reason.

We won’t process outliers in order value because they carry business significance—these are large purchases.

We have orders where reviews were created before the orders themselves.

We’ll consider these outliers, as they violate temporal logic.

But we don’t know whether the issue lies in the order creation date or the review creation date.

Modifying them could introduce bias, and since most of these orders were canceled and there are very few, we’ll leave them as is.

We’ve recorded them as anomalies and will note them in the conclusions.

<h2 id="3-3"> 3.3 Missing Value Handling</h2>

### 3.3.1 Table orders

Let’s examine missing values.

In [None]:
tmp_miss = df_orders.explore.detect_anomalies(return_mode='by_column')

To calculate time intervals between different dates, we should ideally fill missing values between dates if the next stage’s date is present.

We’ll replace missing timestamps.

**order_delivered_carrier_dt**

Let’s look at orders missing the carrier handoff date but having a delivery date or delivery status.

In [None]:
tmp_miss['order_delivered_carrier_dt'].order_status.value_counts()

In [None]:
def tmp_mask(x):
    """Select rows with no carrier hand-off date that still have a customer delivery date or the 'Delivered' status."""
    carrier_missing = x['order_delivered_carrier_dt'].isna()
    reached_customer = x['order_delivered_customer_dt'].notna() | x['order_status'].eq('Delivered')
    return carrier_missing & reached_customer

In [None]:
df_orders[tmp_mask].head()

There are only 2 such orders—we’ll replace them with the median value.

In [None]:
order_carrier_time = (df_orders.order_delivered_carrier_dt - df_orders.order_approved_dt).median()
order_carrier_time

In [None]:
# Selected rows get "approval date + median approval-to-carrier time";
# the right-hand side is aligned by index, so only the masked rows are written.
df_orders.loc[tmp_mask, 'order_delivered_carrier_dt'] = (
    df_orders['order_approved_dt'] + order_carrier_time
)

**order_approved_dt**

Let’s examine orders missing the payment approval date but having a carrier handoff timestamp.

In [None]:
tmp_miss['order_approved_dt'].order_status.value_counts()

In [None]:
def tmp_mask(x):
    """Select rows lacking a payment approval date although a carrier hand-off date exists."""
    return x['order_approved_dt'].isna() & x['order_delivered_carrier_dt'].notna()

In [None]:
print(f'Rows count: {df_orders[tmp_mask].shape[0]}')
df_orders[tmp_mask].head()

Since there are only 14 such orders, we’ll fill the gaps with the purchase date plus the median purchase-to-approval time.

Let’s check the median time from order creation to payment approval.

In [None]:
order_processing_time_hours = (df_orders.order_approved_dt - df_orders.order_purchase_dt).median()
order_processing_time_hours

In [None]:
# Selected rows get "purchase date + median purchase-to-approval time".
# Despite its name, order_processing_time_hours is the median of a datetime difference (a Timedelta).
df_orders.loc[tmp_mask, 'order_approved_dt'] = (
    df_orders['order_purchase_dt'] + order_processing_time_hours
)

**order_delivered_customer_dt**

Let’s examine delivered orders missing the customer receipt date.

In [None]:
tmp_miss['order_delivered_customer_dt'].order_status.value_counts()

In [None]:
tmp_miss['order_delivered_customer_dt'][lambda x: x.order_status == 'Delivered']

There are only 8 such orders—we’ll also replace them with the median value.

In [None]:
order_customer_time = (df_orders.order_delivered_customer_dt - df_orders.order_delivered_carrier_dt).median()
order_customer_time

In [None]:
# Orders marked as delivered that have no customer delivery date.
tmp_mask = (
    df_orders['order_delivered_customer_dt'].isna()
    & df_orders['order_status'].eq('Delivered')
)

In [None]:
# Selected rows get "carrier hand-off date + median carrier-to-customer time".
df_orders.loc[tmp_mask, 'order_delivered_customer_dt'] = (
    df_orders['order_delivered_carrier_dt'] + order_customer_time
)

Missing values in review_comment_title and review_comment_message indicate these fields were left blank, so processing them is unnecessary.

### 3.3.2 Table df_products

Let’s examine missing values in df_products.

In [None]:
tmp_miss = df_products.explore.detect_anomalies(return_mode='by_column')

In product_category_name, we’ll replace missing values with 'Missing in Products'.

In [None]:
# The fill label has to be registered as a category before fillna can use it.
df_products['product_category_name'] = (
    df_products['product_category_name']
    .cat.add_categories(['Missing in Products'])
    .fillna('Missing in Products')
)

Since only 2 products lack specifications, we’ll replace them with the median value for their product category.

Let’s check group sizes if we replace within these groups.

In [None]:
category_columns = 'product_category_name'
df_products['product_weight_g'].preproc.check_group_counts(category_columns=category_columns)

The group sizes are sufficient, so we’ll replace with the group median.

In [None]:
# Products with at least one of weight / length / height / width missing.
df_products.loc[
    df_products[['product_weight_g', 'product_length_cm', 'product_height_cm', 'product_width_cm']]
    .isna()
    .any(axis=1)
]

In [None]:
# Apply the same per-category fill to each physical measurement, in the original order.
for col in ['product_weight_g', 'product_length_cm', 'product_height_cm', 'product_width_cm']:
    df_products[col].preproc.fill_missing_by_category(category_columns=category_columns, inplace=True)

Missing values in product name length and product description length won’t be filled, as we won’t use them for analysis.

Missing values in photo count will be replaced with 1, as it’s both the median and mode.

In [None]:
# Products with an unknown photo count are assumed to have one photo.
df_products['product_photos_qty'] = df_products['product_photos_qty'].fillna(1)

Let’s check how many missing values remain.


In [None]:
df_products.explore.detect_anomalies()

In [None]:
del tmp_miss

<h2 id="3-4"> 3.4 Duplicate Handling</h2>

We have complete duplicate rows in the geolocation table. We'll remove them.

In [None]:
# Remove fully identical rows in place, so any other reference to this frame sees the change too.
df_geolocations.drop_duplicates(inplace=True)

Since the customer and seller tables contain city and state information, while the geolocation table has multiple city/state entries for a single zip prefix,
we'll average coordinates by zip prefix and ignore city/state data.

For states, we'll calculate average coordinates as they'll be needed for geo-analysis.

Otherwise, we won't be able to accurately map customer/seller coordinates by zip prefix.

We'll average the latitude and longitude per zip prefix, leaving one row per prefix.

In [None]:
# Collapse the table to one row per zip prefix holding the mean latitude and longitude.
df_geolocations = (
    df_geolocations
    .groupby('geolocation_zip_code_prefix')[['geolocation_lat', 'geolocation_lng']]
    .mean()
    .reset_index()
)

<h2 id="3-5"> 3.5 Creating New Metrics</h2>

We'll create new variables for future analysis.

Using 5-digit zip codes for geo-analysis may result in overly small sample sizes for some regions, so we'll work with 3-digit prefixes.

In [None]:
def _zip_prefix_3_digits(zip_prefix):
    """Return the first three digits of 5-digit zip code prefixes as integers.

    Prefixes stored as numbers lose their leading zeros (01037 is held as 1037).
    Slicing the unpadded text would turn 1037 into 103 and merge it with the
    unrelated 103xx region, so values are left-padded back to five characters
    first (1037 -> "01037" -> "010" -> 10).
    """
    return zip_prefix.astype(str).str.zfill(5).str[:3].astype(int)


df_geolocations['geolocation_zip_code_prefix_3_digits'] = _zip_prefix_3_digits(df_geolocations['geolocation_zip_code_prefix'])
df_customers['customer_zip_code_prefix_3_digits'] = _zip_prefix_3_digits(df_customers['customer_zip_code_prefix'])
df_sellers['seller_zip_code_prefix_3_digits'] = _zip_prefix_3_digits(df_sellers['seller_zip_code_prefix'])

### 3.5.1 Table customers

We'll add population data (2018) for each customer state.

In [None]:
# Estimated 2018 population per Brazilian federative unit, keyed by state code.
# The figures sum to 208,494,900.
population = {
    'AC': 869265,
    'AL': 3322820,
    'AP': 829494,
    'AM': 4080611,
    'BA': 14812617,
    'CE': 9075649,
    'DF': 2974703,  # was 3974703: a one-digit typo that overstated the Federal District by 1,000,000
    'ES': 3972388,
    'GO': 6921161,
    'MA': 7035055,
    'MT': 3441998,
    'MS': 2748023,
    'MG': 21040662,
    'PA': 8513497,
    'PB': 3996496,
    'PR': 11348937,
    'PE': 9496294,
    'PI': 3264531,
    'RJ': 17159960,
    'RN': 3479010,
    'RS': 11329605,
    'RO': 1757589,
    'RR': 576568,
    'SC': 7075494,
    'SP': 45538936,
    'SE': 2278308,
    'TO': 1555229
}
# State codes are upper-cased before the lookup; codes missing from the dict map to NaN.
df_customers['population'] = df_customers['customer_state'].str.upper().map(population)

### 3.5.2 Table orders

**Purchase-to-Payment Approval Time**

In [None]:
df_orders['from_purchase_to_approved_hours'] = df_orders.order_approved_dt - df_orders.order_purchase_dt

In [None]:
(df_orders['from_purchase_to_approved_hours'] < pd.Timedelta(0)).sum()

Convert to hours.

In [None]:
# Timedelta -> fractional hours.
df_orders['from_purchase_to_approved_hours'] = (
    df_orders['from_purchase_to_approved_hours'].dt.total_seconds().div(3600)
)

---

**Purchase-to-Carrier Handoff Time**

In [None]:
df_orders['from_purchase_to_carrier_days'] = df_orders.order_delivered_carrier_dt - df_orders.order_purchase_dt

In [None]:
(df_orders['from_purchase_to_carrier_days'] < pd.Timedelta(0)).sum()

Replace negative values with the median.

In [None]:
# Median of the non-negative durations only, so the invalid values do not bias the replacement.
median_ = df_orders.loc[
    df_orders['from_purchase_to_carrier_days'].ge(pd.Timedelta(0)),
    'from_purchase_to_carrier_days',
].median()

In [None]:
# Overwrite negative durations with the median computed above.
df_orders.loc[
    df_orders['from_purchase_to_carrier_days'].lt(pd.Timedelta(0)),
    'from_purchase_to_carrier_days',
] = median_

In [None]:
(df_orders['from_purchase_to_carrier_days'] < pd.Timedelta(0)).sum()

Convert to days.

In [None]:
# Timedelta -> fractional days (24 * 3600 seconds per day).
df_orders['from_purchase_to_carrier_days'] = (
    df_orders['from_purchase_to_carrier_days'].dt.total_seconds().div(24 * 3600)
)

---

**Purchase-to-Customer Delivery Time**

In [None]:
df_orders['delivery_time_days'] = df_orders.order_delivered_customer_dt - df_orders.order_purchase_dt

In [None]:
(df_orders['delivery_time_days'] < pd.Timedelta(0)).sum()

Convert to days.

In [None]:
df_orders['delivery_time_days'] = df_orders['delivery_time_days'].dt.total_seconds() / (24 * 3600)

---

**Estimated Delivery Time**

In [None]:
df_orders['delivery_time_estimated_days'] = df_orders.order_estimated_delivery_dt - df_orders.order_purchase_dt

In [None]:
(df_orders['delivery_time_estimated_days'] < pd.Timedelta(0)).sum()

Convert to days.

In [None]:
df_orders['delivery_time_estimated_days'] = df_orders['delivery_time_estimated_days'].dt.total_seconds() / (24 * 3600)

---

**Actual vs. Estimated Delivery Time Difference**

In [None]:
df_orders['delivery_delay_days'] = df_orders.order_delivered_customer_dt - df_orders.order_estimated_delivery_dt

Convert to days.

In [None]:
df_orders['delivery_delay_days'] = df_orders['delivery_delay_days'].dt.total_seconds() / (24 * 3600)

---

**Payment Approval-to-Carrier Handoff Time**

In [None]:
df_orders['from_approved_to_carrier_days'] = df_orders.order_delivered_carrier_dt - df_orders.order_approved_dt

In [None]:
(df_orders['from_approved_to_carrier_days'] < pd.Timedelta(0)).sum()

Replace negative values with the median.

In [None]:
median_ = df_orders['from_approved_to_carrier_days'][lambda x: x >= pd.Timedelta(0)].median()

In [None]:
df_orders.loc[lambda x: x['from_approved_to_carrier_days'] < pd.Timedelta(0), 'from_approved_to_carrier_days'] = median_

In [None]:
(df_orders['from_approved_to_carrier_days'] < pd.Timedelta(0)).sum()

Convert to days.

In [None]:
df_orders['from_approved_to_carrier_days'] = df_orders.from_approved_to_carrier_days.dt.total_seconds() / (24 * 3600)

---

**Carrier Delivery Time**

In [None]:
df_orders['from_carrier_to_customer_days'] = df_orders.order_delivered_customer_dt - df_orders.order_delivered_carrier_dt

In [None]:
(df_orders['from_carrier_to_customer_days'] < pd.Timedelta(0)).sum()

Replace with median value.

In [None]:
median_ = df_orders['from_carrier_to_customer_days'][lambda x: x >= pd.Timedelta(0)].median()

In [None]:
df_orders.loc[lambda x: x['from_carrier_to_customer_days'] < pd.Timedelta(0), 'from_carrier_to_customer_days'] = median_

In [None]:
(df_orders['from_carrier_to_customer_days'] < pd.Timedelta(0)).sum()

Convert to days.

In [None]:
df_orders['from_carrier_to_customer_days'] = df_orders['from_carrier_to_customer_days'].dt.total_seconds() / (24 * 3600)

### 3.5.3 Table reviews

**Review Response Time**

In [None]:
df_reviews['review_answer_time_days'] = df_reviews.review_answer_dt - df_reviews.review_creation_dt

In [None]:
(df_reviews['review_answer_time_days'] < pd.Timedelta(0)).sum()

Convert to days.

In [None]:
df_reviews['review_answer_time_days'] = df_reviews['review_answer_time_days'].dt.total_seconds() / (24 * 3600)

**Review Character Length**

In [None]:
df_reviews['review_comment_message_len'] = df_reviews['review_comment_message'].str.len()

### 3.5.4 Tables items and products

Create a new variable for total product cost including shipping.

In [None]:
# Item cost including shipping.
df_items['total_price'] = df_items['price'].add(df_items['freight_value'])

Create a new variable for product volume.

In [None]:
# Box volume: length x height x width.
df_products['product_volume_cm3'] = (
    df_products['product_length_cm']
    .mul(df_products['product_height_cm'])
    .mul(df_products['product_width_cm'])
)

Create a new weight-to-volume ratio variable.

In [None]:
# Grams per cubic centimetre, rounded to two decimals.
df_products['weight_to_volume_ratio'] = (
    df_products['product_weight_g']
    .div(df_products['product_volume_cm3'])
    .round(2)
)

<h2 id="3-6"> 3.6 Creating New Dimensions</h2>

We'll create new dimensions for future analysis.

### 3.6.1 Table payments

Create a new dimension indicating installment payments.

In [None]:
# True when the payment is split into more than one installment.
df_payments['has_installments'] = df_payments['payment_installments'].gt(1)

In [None]:
# Turn the boolean flag into a readable two-level category.
df_payments['has_installments'] = (
    df_payments['has_installments']
    .map({True: 'Has Installments', False: 'No Installments'})
    .astype('category')
)

### 3.6.2 Table orders

**Delivery Failure Reason**

Categorize as follows (note: "approved" could fit both categories - we'll classify it as service-side):

- Service-side issues: shipped, processing, unavailable, approved
- Customer-side issues: created, invoiced, canceled

In [None]:
# Map each order status to the side responsible for the order not being delivered.
# Statuses matching none of the lists (including missing ones) take the default label.
rules = {
    'No Issues': lambda x: x.eq('Delivered'),
    'Service Issue': lambda x: x.isin(['Approved', 'Shipped', 'Processing', 'Unavailable']),
    'Customer Issue': lambda x: x.isin(['Created', 'Invoiced', 'Canceled']),
    'Missing Status': 'default',
}
df_orders['delivery_issue_reason'] = df_orders['order_status'].preproc.to_categorical(rules=rules)

---

**Delivery Delay Status**

In [None]:
# Classify each order by comparing the actual delivery date with the estimated one.
# The original literal listed "Missing Delivery Dt" twice. Python keeps the first position
# of a repeated key but its last value, so the explicit isna() rule was silently discarded
# and the dict was effectively the mapping below. Rows with a missing date still end up in
# the default label, because comparisons against NaT are False for both rules.
rules = {
    "Missing Delivery Dt": "default",
    "Delayed": lambda x: df_orders['order_delivered_customer_dt'] > df_orders['order_estimated_delivery_dt'],
    'Not Delayed': lambda x: df_orders['order_delivered_customer_dt'] <= df_orders['order_estimated_delivery_dt'],
}
df_orders['is_delayed'] = df_orders.order_delivered_customer_dt.preproc.to_categorical(rules=rules)

---

**Delivery Status**

In [None]:
# Binary delivery flag derived from the order status.
rules = {
    "Delivered": lambda x: x == 'Delivered',
    # A bare `x != 'Delivered'` is also True for missing statuses, which would make the
    # default label unreachable; missing values are excluded so they fall through to it.
    "Not Delivered": lambda x: x.notna() & (x != 'Delivered'),
    "Missing Status": "default"
}
df_orders['is_delivered'] = df_orders.order_status.preproc.to_categorical(rules=rules)

---

**Season**

Create a season variable.

Note: South American seasons differ from Europe due to hemispheric positioning.

In [None]:
# Southern-hemisphere seasons by purchase month; rows without a purchase date take the default label.
rules = {
    'Summer': lambda x: x.dt.month.isin([12, 1, 2]),    # December - February
    'Autumn': lambda x: x.dt.month.isin([3, 4, 5]),     # March - May
    'Winter': lambda x: x.dt.month.isin([6, 7, 8]),     # June - August
    'Spring': lambda x: x.dt.month.isin([9, 10, 11]),   # September - November
    'Missing Purchase Dt': 'default',
}
df_orders['purchase_season'] = df_orders['order_purchase_dt'].preproc.to_categorical(rules=rules)

---

**Year**

In [None]:
# Purchase year as a categorical; rows without a purchase timestamp get an explicit label.
df_orders['purchase_year'] = (
    df_orders['order_purchase_dt'].dt.year
    .fillna('Missing Purchase Dt')
    .astype('category')
)
df_orders['purchase_year'].value_counts()

---

**Month Name**

Create a month name variable for the orders table.

In [None]:
# Month name of the purchase; rows without a purchase timestamp get an explicit label.
df_orders['purchase_month'] = df_orders['order_purchase_dt'].dt.month_name().fillna('Missing Purchase Dt')
month_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
# reorder_categories() raises ValueError unless the observed categories are exactly
# `month_order`, so it fails as soon as the placeholder label is present or some month
# never occurs. set_categories() accepts any target list; the placeholder is appended
# (sorting last) only when it actually occurs, so the normal case is unchanged.
month_categories = month_order + (
    ['Missing Purchase Dt'] if (df_orders['purchase_month'] == 'Missing Purchase Dt').any() else []
)
df_orders['purchase_month'] = (
    df_orders['purchase_month']
    .astype('category')
    .cat.set_categories(month_categories, ordered=True)
)
df_orders['purchase_month'].value_counts()

---

**Day Type**

In [None]:
# Weekend vs. weekday purchase, based on day_of_week (5 and 6 are treated as weekend).
rules = {
    "Weekend": lambda x: x.dt.day_of_week.isin([5, 6]),
    "Weekday": lambda x: x.dt.day_of_week.isin(range(5)),
    "Missing Purchase Dt": "default"
}
df_orders['purchase_day_type'] = df_orders.order_purchase_dt.preproc.to_categorical(rules=rules)

---

**Day of Week**

In [None]:
# Weekday name of the purchase; rows without a purchase timestamp get an explicit label.
df_orders['purchase_weekday'] = df_orders['order_purchase_dt'].dt.day_name().fillna('Missing Purchase Dt')
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# reorder_categories() raises ValueError unless the observed categories are exactly
# `weekday_order`; set_categories() tolerates an absent weekday, and the placeholder is
# appended (sorting last) only when it actually occurs.
weekday_categories = weekday_order + (
    ['Missing Purchase Dt'] if (df_orders['purchase_weekday'] == 'Missing Purchase Dt').any() else []
)
df_orders['purchase_weekday'] = (
    df_orders['purchase_weekday']
    .astype('category')
    .cat.set_categories(weekday_categories, ordered=True)
)
df_orders['purchase_weekday'].value_counts(dropna=False)

---

**Time of Day**

In [None]:
# Part of day by purchase hour: 5-11 morning, 12-16 afternoon, 17-22 evening,
# 23 and 0-4 night (between() is inclusive on both ends).
rules = {
    "Morning": lambda x: x.dt.hour.between(5,11),
    "Afternoon": lambda x: x.dt.hour.between(12,16),
    "Evening": lambda x: x.dt.hour.between(17,22),
    "Night": lambda x: x.dt.hour.isin([23, 0, 1, 2, 3, 4]),
    "Missing Purchase Dt": "default"
}
df_orders['purchase_time_of_day'] = df_orders.order_purchase_dt.preproc.to_categorical(rules=rules)

---

**Hour**

In [None]:
# Purchase hour as a categorical; rows without a purchase timestamp get an explicit label.
df_orders['purchase_hour'] = (
    df_orders['order_purchase_dt'].dt.hour
    .fillna('Missing Purchase Dt')
    .astype('category')
)
df_orders['purchase_hour'].value_counts()

---

**Delivery Time Category**

Let's look at quantiles

In [None]:
# Min, 5/25/50/75/95th percentiles and max of delivery time, used to pick bin edges.
df_orders.delivery_time_days.quantile([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1])

Based on quantile analysis:

- Fast: ≤5 days
- Medium: 5-15 days
- Long: >15 days

In [None]:
# Delivery-time categories and their bin edges in days (open-ended at both extremes).
labels = [
    'Fast',
    'Medium',
    'Long',
]
bins = [
    -np.inf,
    5,
    15,
    np.inf,
]

In [None]:
# Bin delivery time into the Fast / Medium / Long categories defined above.
df_orders['delivery_time_days_cat'] = df_orders.delivery_time_days.preproc.to_categorical(method='custom_bins', labels=labels, bins=bins)

### 3.6.3 Table reviews

**Season**

In [None]:
# Season of the review, keyed by calendar month of the review creation timestamp
# (same month-to-season mapping as for purchases).
rules = {
    "Summer": lambda x: x.dt.month.isin([12, 1, 2]),
    "Autumn": lambda x: x.dt.month.isin([3, 4, 5]),
    "Winter": lambda x: x.dt.month.isin([6, 7, 8]),
    "Spring": lambda x: x.dt.month.isin([9, 10, 11]),
    "Missing Review Dt": "default"
}
df_reviews['review_season'] = df_reviews.review_creation_dt.preproc.to_categorical(rules=rules)

---

**Day Type**

In [None]:
# Weekend vs. weekday review, based on day_of_week (5 and 6 are treated as weekend).
rules = {
    "Weekend": lambda x: x.dt.day_of_week.isin([5, 6]),
    "Weekday": lambda x: x.dt.day_of_week.isin(range(5)),
    "Missing Review Dt": "default"
}
df_reviews['review_day_type'] = df_reviews.review_creation_dt.preproc.to_categorical(rules=rules)

---

**Day of Week**

In [None]:
# Weekday name of the review; rows without a review timestamp get an explicit label.
df_reviews['review_creation_weekday'] = df_reviews['review_creation_dt'].dt.day_name().fillna('Missing Review Dt')
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# reorder_categories() raises ValueError unless the observed categories are exactly
# `weekday_order`; set_categories() tolerates an absent weekday, and the placeholder is
# appended (sorting last) only when it actually occurs.
review_weekday_categories = weekday_order + (
    ['Missing Review Dt'] if (df_reviews['review_creation_weekday'] == 'Missing Review Dt').any() else []
)
df_reviews['review_creation_weekday'] = (
    df_reviews['review_creation_weekday']
    .astype('category')
    .cat.set_categories(review_weekday_categories, ordered=True)
)
df_reviews['review_creation_weekday'].value_counts()

### 3.6.4 Table products

**Translate category names to English**

Verify key consistency.

In [None]:
# Compare product_category_name keys between products and the translation table
# ahead of the left join performed below.
fron.analyze_join_keys(df_products, df_categories, 'product_category_name', how='left')

Examine category discrepancies.

In [None]:
# Category names present in products but absent from the translation table.
np.setdiff1d(df_products.product_category_name.unique(), df_categories.product_category_name.unique())

Add English translations.

In [None]:
# Append the translations missing from the lookup table; each row is written at the
# next free integer label (the current row count).
for _new_row in (
    ('pc_gamer', 'Gaming Pc'),
    ('portateis_cozinha_e_preparadores_de_alimentos', 'Kitchen Appliances And Food Preparers'),
    ('Missing in Products', 'Missing in Products'),
):
    df_categories.loc[len(df_categories)] = _new_row

Replace categories with English versions.

In [None]:
# Attach the English category name, then drop the original name column and expose the
# English one as 'product_category'.
df_products = df_products.merge(df_categories, on='product_category_name', how='left')
df_products = df_products.drop(columns=['product_category_name'])
df_products = df_products.rename(columns={'product_category_name_english': 'product_category'})

Check for missing values.

In [None]:
# Anomaly report for products after the translation merge (used here to check for
# missing values).
df_products.explore.detect_anomalies()

---

**Generalized Product Categories**

Create broader product categories for visualization.

In [None]:
# Broad product groups -> detailed English category names they contain.
# Each detailed category must appear in exactly one group: the mapping is later inverted
# into a category -> group lookup, where a repeated entry is silently overridden by the
# last group listing it. 'Garden Tools' used to be listed under both 'Furniture' and
# 'Home & Garden'; only the effective 'Home & Garden' entry is kept.
# NOTE(review): 'Home Comfort' (Furniture) and 'Home Confort' (Home & Garden) differ by
# one letter -- confirm both spellings occur in product_category.
category_mapping = {
    'Furniture': [
        'Office Furniture', 'Furniture Decor', 'Furniture Living Room', 
        'Kitchen Dining Laundry Garden Furniture', 'Bed Bath Table', 
        'Home Comfort', 'Home Comfort 2', 'Home Construction', 
        'Furniture Bedroom', 'Furniture Mattress And Upholstery'
    ],
    'Electronics': [
        'Auto', 'Computers Accessories', 'Gaming Pc', 'Musical Instruments', 
        'Consoles Games', 'Watches Gifts', 'Air Conditioning', 'Telephony', 
        'Electronics', 'Fixed Telephony', 'Tablets Printing Image', 
        'Computers', 'Small Appliances Home Oven And Coffee', 
        'Small Appliances', 'Audio', 'Signaling And Security', 
        'Security And Services'
    ],
    'Fashion': [
        'Fashio Female Clothing', 'Fashion Male Clothing', 
        'Fashion Bags Accessories', 'Fashion Shoes', 'Fashion Sport', 
        'Fashion Underwear Beach', 'Fashion Childrens Clothes', 'Baby', 
        'Cool Stuff'
    ],
    'Home & Garden': [
        'Housewares', 'Kitchen Appliances And Food Preparers', 
        'Home Confort', 'Home Appliances', 'Home Appliances 2', 
        'Flowers', 'Costruction Tools Garden', 'Garden Tools', 
        'Construction Tools Lights', 'Costruction Tools Tools', 
        'Luggage Accessories', 'La Cuisine', 'Pet Shop', 'Market Place'
    ],
    'Entertainment': [
        'Sports Leisure', 'Toys', 'Cds Dvds Musicals', 'Music', 
        'Dvds Blu Ray', 'Cine Photo', 'Party Supplies', 
        'Christmas Supplies', 'Arts And Craftmanship', 'Art'
    ],
    'Beauty & Health': [
        'Health Beauty', 'Perfumery', 'Diapers And Hygiene'
    ],
    'Food & Drinks': [
        'Food Drink', 'Drinks', 'Food'
    ],
    'Books & Stationery': [
        'Books General Interest', 'Books Technical', 
        'Books Imported', 'Stationery'
    ],
    'Industry & Construction': [
        'Construction Tools Construction', 'Construction Tools Safety', 
        'Industry Commerce And Business', 'Agro Industry And Commerce'
    ]
}

# Invert the group -> members mapping into a member -> group lookup
# (a member listed in several groups ends up with the last group that lists it).
category_dict = {}
for _group, _members in category_mapping.items():
    for _member in _members:
        category_dict[_member] = _group

# Categories without a group are labelled 'Missing in Products'.
df_products['general_product_category'] = (
    df_products['product_category']
    .map(category_dict)
    .fillna('Missing in Products')
    .astype('category')
)
df_products['general_product_category'].value_counts(dropna=False)

### 3.6.5 Table df_geolocations

Add a field indicating South American coordinates.

In [None]:
# True when the point lies inside the bounding box used here for South America:
# latitude -53.90..12.45 (south..north), longitude -81.32..-34.79 (west..east),
# bounds inclusive.
df_geolocations['in_south_america'] = (
    df_geolocations['geolocation_lat'].between(-53.90, 12.45)
    & df_geolocations['geolocation_lng'].between(-81.32, -34.79)
)

<h2 id="3-7"> 3.7 Converting Data to a Convenient Format</h2>

Replace state names with more readable versions.

In [None]:
# Two-letter state codes -> unaccented full state names.
state_name = dict(
    AC="Acre",
    AL="Alagoas",
    AM="Amazonas",
    AP="Amapa",
    BA="Bahia",
    CE="Ceara",
    DF="Distrito Federal",
    ES="Espirito Santo",
    GO="Goias",
    MA="Maranhao",
    MG="Minas Gerais",
    MS="Mato Grosso do Sul",
    MT="Mato Grosso",
    PA="Para",
    PB="Paraiba",
    PE="Pernambuco",
    PI="Piaui",
    PR="Parana",
    RJ="Rio de Janeiro",
    RN="Rio Grande do Norte",
    RO="Rondonia",
    RR="Roraima",
    RS="Rio Grande do Sul",
    SC="Santa Catarina",
    SE="Sergipe",
    SP="Sao Paulo",
    TO="Tocantins",
)

In [None]:
# Keep the upper-cased state code before customer_state is replaced by the full name.
df_customers['customer_state_short'] = df_customers['customer_state'].str.upper()

In [None]:
# Replace state codes with full names (codes are upper-cased first) in both tables and
# store the result as a categorical column.
for _frame, _column in (
    (df_customers, 'customer_state'),
    (df_sellers, 'seller_state'),
):
    _frame[_column] = (
        _frame[_column]
        .str.upper()
        .replace(state_name)
        .astype('category')
    )

Verify no states were lost.

In [None]:
# Number of customers whose state is missing after the replacement.
df_customers['customer_state'].isna().sum()

In [None]:
# Number of sellers whose state is missing after the replacement.
df_sellers['seller_state'].isna().sum()

Check for fractional values in product specifications.

In [None]:
# For every column except the id and the two category columns, print how many values
# have a non-zero remainder modulo 1 (i.e. are not whole numbers).
for col in df_products.columns.difference(['product_id', 'product_category', 'general_product_category']):
    print(f'{col}: {(df_products[col] % 1 != 0).sum()}')

All is well - convert to integer type.

In [None]:
# Cast the whole-number product measurements to int32, then preview one row.
for _col in (
    'product_height_cm',
    'product_length_cm',
    'product_photos_qty',
    'product_weight_g',
    'product_width_cm',
):
    df_products[_col] = df_products[_col].astype('int32')
df_products.head(1)

<h2 id="3-8"> 3.8 Data Merging</h2>

### 3.8.1 Enriching Table df_items

**Buyer-Seller Distance**

Create a distance variable.

In [None]:
# Customers with the coordinates of their zip prefix; coordinate columns are renamed
# with a customer suffix and the geolocation key columns are dropped.
tmp_df_customers = (
    df_customers
    .merge(
        df_geolocations,
        left_on='customer_zip_code_prefix',
        right_on='geolocation_zip_code_prefix',
        how='left',
    )
    .rename(columns={'geolocation_lat': 'lat_customer', 'geolocation_lng': 'lng_customer'})
    .drop(columns=['geolocation_zip_code_prefix', 'geolocation_zip_code_prefix_3_digits'])
)

In [None]:

# Sellers with the coordinates of their zip prefix; coordinate columns are renamed
# with a seller suffix and the geolocation key columns are dropped.
tmp_df_sellers = (
    df_sellers
    .merge(
        df_geolocations,
        left_on='seller_zip_code_prefix',
        right_on='geolocation_zip_code_prefix',
        how='left',
    )
    .rename(columns={'geolocation_lat': 'lat_seller', 'geolocation_lng': 'lng_seller'})
    .drop(columns=['geolocation_zip_code_prefix', 'geolocation_zip_code_prefix_3_digits'])
)

In [None]:
# Bring customer and seller coordinates onto every order item:
# item -> order (for customer_id) -> customer coordinates, then seller coordinates.
df_items = df_items.merge(df_orders[['order_id', 'customer_id']], on='order_id', how='left')
df_items = df_items.merge(
    tmp_df_customers[['customer_id', 'lat_customer', 'lng_customer']],
    on='customer_id',
    how='left',
)
df_items = df_items.merge(
    tmp_df_sellers[['seller_id', 'lat_seller', 'lng_seller']],
    on='seller_id',
    how='left',
)

Calculate buyer-seller distances.

In [None]:
# Customer-to-seller distance per item; arguments are passed as raw arrays in the order
# customer lat, customer lng, seller lat, seller lng.
df_items['distance_km'] = fron.haversine_vectorized(
    *(
        df_items[_coord].values
        for _coord in ('lat_customer', 'lng_customer', 'lat_seller', 'lng_seller')
    )
)

In [None]:
# The helper columns were only needed to compute the distance.
df_items.drop(
    columns=['customer_id', 'lat_customer', 'lng_customer', 'lat_seller', 'lng_seller'],
    inplace=True,
)

Check for missing values.

In [None]:
# Number of items for which no distance could be computed.
df_items.distance_km.isna().sum()

Missing values occur when zip prefixes are absent from the geolocation table for some buyers/sellers.

The quantity is small - leave as is.

**Carrier Handoff Delay**

Create a carrier handoff delay variable.

In [None]:
# Attach the carrier handoff timestamp of the order to each of its items.
df_items = df_items.merge(df_orders[['order_id', 'order_delivered_carrier_dt']], on='order_id', how='left')

In [None]:
# Handoff time minus the shipping deadline: positive when the seller handed the item
# over after the limit. Still a timedelta here; converted to days in the next step.
df_items['carrier_delivery_delay_days'] = df_items['order_delivered_carrier_dt'] - df_items['shipping_limit_dt']

Convert to days.

In [None]:
# Express the delay as fractional days (seconds divided by seconds per day).
df_items['carrier_delivery_delay_days'] = (
    df_items['carrier_delivery_delay_days'].dt.total_seconds() / (24 * 3600)
)

In [None]:
# The handoff timestamp was only needed to compute the delay.
df_items.drop(columns=['order_delivered_carrier_dt'], inplace=True)

### 3.8.2 Enriching Table df_orders

#### 3.8.2.1 From Table  df_payments

Create these order-level metrics:

- Payment count
- Payment sum
- Average payment
- Total installment count

Create these order-level dimensions:

- Installment presence
- Payment types

In [None]:
# Work on a copy so df_payments keeps its labelled column; turn the label back into a
# boolean so it can be aggregated with 'any' per order.
tmp_df_pay_agg = df_payments.copy()
tmp_df_pay_agg['has_installments'] = tmp_df_pay_agg['has_installments'] == 'Has Installments'

Examine unique payment types per order.

In [None]:
# Distribution of the number of distinct payment types used within one order.
tmp_df_pay_agg.groupby('order_id')['payment_type'].nunique().value_counts()

Since maximum is 2 payment types, concatenate them during aggregation.

In [None]:
# Collapse payments to one row per order.
tmp_df_pay_agg = tmp_df_pay_agg.groupby('order_id', as_index=False).agg(
    # payment counts and amounts
    payments_cnt=('payment_sequential', 'count'),
    total_payment=('payment_value', 'sum'),
    avg_payment=('payment_value', 'mean'),
    total_installments_cnt=('payment_installments', 'sum'),
    # True when at least one payment of the order has installments
    order_has_installment=('has_installments', 'any'),
    # distinct payment types, alphabetically sorted and comma-separated
    order_payment_types=('payment_type', lambda types: ', '.join(sorted(set(types)))),
)

In [None]:
# Store the concatenated payment types as a categorical dimension.
tmp_df_pay_agg['order_payment_types'] = tmp_df_pay_agg['order_payment_types'].astype('category')

In [None]:
# Replace the aggregated boolean with readable labels stored as a categorical column.
tmp_df_pay_agg['order_has_installment'] = (
    tmp_df_pay_agg['order_has_installment']
    .map({True: 'Has Installments', False: 'No Installments'})
    .astype('category')
)

Merge with df_orders.

Check key mismatches.

In [None]:
# Compare order_id keys between orders and the per-order payment aggregates ahead of
# the left join below.
fron.analyze_join_keys(df_orders, tmp_df_pay_agg, "order_id", how='left')

Merge tables.

In [None]:
# Add the payment aggregates to orders; orders without payments keep missing values.
df_orders = df_orders.merge(tmp_df_pay_agg, on='order_id', how='left')

#### 3.8.2.2 From Tables df_items and df_products

Create these order-level metrics:

- Total product count
- Unique product count
- Seller count
- Unique category count
- Total product price
- Average product price
- Total shipping cost
- Total order value
- Shipping cost ratio
- Order weight
- Order volume
- Average buyer-seller distance
- Average carrier handoff delay

Create these order-level dimensions:

- Free shipping indicator
- Generalized product categories

First merge items and products tables.

Check key mismatches.

In [None]:
# Compare product_id keys between items and products ahead of the left join below.
fron.analyze_join_keys(df_items, df_products, "product_id", how='left')

Merge tables.

In [None]:
# One row per order item, enriched with the attributes of its product.
tmp_df_items_prods = df_items.merge(df_products, on='product_id', how='left')

Prepare dataframe.

Examine unique categories per order.

In [None]:
# Distribution of the number of distinct product categories within one order.
tmp_df_items_prods.groupby('order_id')['product_category'].nunique().value_counts()

Maximum 3 categories - concatenate during aggregation.

Examine unique generalized categories per order.

In [None]:
# Distribution of the number of distinct generalized categories within one order.
tmp_df_items_prods.groupby('order_id')['general_product_category'].nunique().value_counts()

Maximum 3 categories - concatenate during aggregation.

In [None]:
# Collapse items (with product attributes) to one row per order.
tmp_df_items_prods_agg = tmp_df_items_prods.groupby('order_id', as_index=False).agg(
    # counts
    products_cnt=('product_id', 'count'),
    unique_products_cnt=('product_id', 'nunique'),
    sellers_cnt=('seller_id', 'nunique'),
    product_categories_cnt=('product_category', 'nunique'),
    # money
    total_products_price=('price', 'sum'),
    avg_products_price=('price', 'mean'),
    total_freight_value=('freight_value', 'sum'),
    total_order_price=('total_price', 'sum'),
    # physical size; the weight is summed in grams here and converted to kg afterwards
    total_weight_kg=('product_weight_g', 'sum'),
    total_volume_cm3=('product_volume_cm3', 'sum'),
    # logistics
    avg_distance_km=('distance_km', 'mean'),
    avg_carrier_delivery_delay_days=('carrier_delivery_delay_days', 'mean'),
    # distinct categories, alphabetically sorted and comma-separated
    order_product_categories=('product_category', lambda cats: ', '.join(sorted(set(cats)))),
    order_general_product_categories=(
        'general_product_category',
        lambda cats: ', '.join(sorted(set(cats))),
    ),
)

In [None]:
# Share of the order total that is freight.
tmp_df_items_prods_agg['freight_ratio'] = (
    tmp_df_items_prods_agg['total_freight_value']
    / tmp_df_items_prods_agg['total_order_price']
)

In [None]:
# Store both concatenated category strings as categorical dimensions.
for _col in ('order_product_categories', 'order_general_product_categories'):
    tmp_df_items_prods_agg[_col] = tmp_df_items_prods_agg[_col].astype('category')

In [None]:
# The column still holds the summed grams; convert to kilograms with two decimals.
_weight_g = tmp_df_items_prods_agg['total_weight_kg']
tmp_df_items_prods_agg['total_weight_kg'] = (_weight_g / 1000).round(2)

In [None]:
# An order counts as free shipping when its summed freight value is exactly zero.
tmp_df_items_prods_agg['order_is_free_shipping'] = tmp_df_items_prods_agg.total_freight_value == 0

In [None]:
# Replace the boolean flag with readable labels stored as a categorical column.
tmp_df_items_prods_agg['order_is_free_shipping'] = (
    tmp_df_items_prods_agg['order_is_free_shipping']
    .map({True: 'Free Shipping', False: 'Paid Shipping'})
    .astype('category')
)

In [None]:
# Label frequencies, including missing values if any.
tmp_df_items_prods_agg['order_is_free_shipping'].value_counts(dropna=False)

Add new fields to df_orders.

Check key mismatches.

In [None]:
# Compare order_id keys between orders and the per-order item aggregates ahead of the
# left join below.
fron.analyze_join_keys(df_orders, tmp_df_items_prods_agg, "order_id", how='left')

We previously identified orders in df_orders missing from df_items.

These are all either canceled or unavailable - missing values from items table are expected.

Merge tables.

In [None]:
# Add the item aggregates to orders; orders absent from items keep missing values.
df_orders = df_orders.merge(tmp_df_items_prods_agg, on='order_id', how='left')

#### 3.8.2.3 From Table  df_customers

Add customer city and state.

Check key mismatches.

In [None]:
# Compare customer_id keys between orders and customers ahead of the left join below.
fron.analyze_join_keys(df_orders, df_customers, "customer_id", how='left')

Add fields to df_orders.

In [None]:
# Add the customer's unique id, state and city to every order.
df_orders = df_orders.merge(
    df_customers[['customer_id', 'customer_unique_id', 'customer_state', 'customer_city']],
    on='customer_id',
    how='left',
)

#### 3.8.2.4 From Table  df_reviews

Create these order-level metrics:

- Review count
- Average review score

In [None]:
# One row per order: number of distinct reviews and their mean score.
tmp_df_reviews_agg = df_reviews.groupby('order_id', as_index=False).agg(
    reviews_cnt=('review_id', 'nunique'),
    order_avg_reviews_score=('review_score', 'mean'),
)

Since secondary reviews tend to be lower-scored, round down.

In [None]:
# Round the mean score down to a whole number and store it as an integer.
tmp_df_reviews_agg['order_avg_reviews_score'] = np.floor(tmp_df_reviews_agg['order_avg_reviews_score']).astype(int)

Check key mismatches.

In [None]:
# Compare order_id keys between orders and the per-order review aggregates ahead of
# the left join below.
fron.analyze_join_keys(df_orders, tmp_df_reviews_agg, "order_id", how='left')

Add fields to df_orders.

In [None]:
# Add the review aggregates to orders; orders without reviews keep missing values.
df_orders = df_orders.merge(tmp_df_reviews_agg, on='order_id', how='left')

#### 3.8.2.5 Creating New Dimensions

**Create a new payment amount dimension**

Let’s look at the quantiles in the column.

In [None]:
# 5/25/50/75/95th percentiles of the order payment total, used to pick bin edges.
df_orders.total_payment.quantile([0.05, 0.25, 0.5, 0.75, 0.95])

Create the following categories:

- Cheap: ≤50 R$
- Medium: 50-200 R$
- Expensive: >200 R$

In [None]:
# Payment-amount categories and their bin edges in R$ (open-ended at both extremes).
labels = [
    'Cheap',
    'Medium',
    'Expensive',
]
bins = [
    -np.inf,
    50,
    200,
    np.inf,
]

In [None]:
# Bin the order payment total into the Cheap / Medium / Expensive categories above.
df_orders['order_total_payment_cat'] = df_orders.total_payment.preproc.to_categorical(method='custom_bins', labels=labels, bins=bins)

**Create a new order weight dimension**

Let’s look at the quantiles in the column.

In [None]:
# 5/25/50/75/95th percentiles of the order weight in kg, used to pick bin edges.
df_orders.total_weight_kg.quantile([0.05, 0.25, 0.5, 0.75, 0.95])

Categories for order weight:

- Light: ≤0.5 kg
- Medium: 0.5-5 kg
- Heavy: >5 kg

In [None]:
# Order-weight categories and their bin edges in kilograms (open-ended at both extremes).
labels = [
    'Light',
    'Medium',
    'Heavy',
]
bins = [
    -np.inf,
    0.5,
    5,
    np.inf,
]

In [None]:
# Bin the order weight into Light / Medium / Heavy; orders without a weight (absent
# from the items table) are labelled 'Missing in Items'.
df_orders['order_total_weight_cat'] = (
    df_orders.total_weight_kg.preproc
    .to_categorical(method='custom_bins', labels=labels, bins=bins, fill_na_value='Missing in Items')
)

**Create a new order volume dimension**

Let’s look at the quantiles in the column.

In [None]:
# 5/25/50/75/95th percentiles of the order volume in cm3, used to pick bin edges.
df_orders.total_volume_cm3.quantile([0.05, 0.25, 0.5, 0.75, 0.95])

Categories for order volume:

- Small: ≤3500 cm3
- Medium: 3500-10000 cm3
- Large: >10000 cm3

In [None]:
# Order-volume categories and their bin edges in cm3 (open-ended at both extremes).
labels = [
    'Small',
    'Medium',
    'Large',
]
bins = [
    -np.inf,
    3500,
    10000,
    np.inf,
]

In [None]:
# Bin the order volume into Small / Medium / Large; orders without a volume (absent
# from the items table) are labelled 'Missing in Items'.
df_orders['order_total_volume_cat'] = (
    df_orders.total_volume_cm3.preproc
    .to_categorical(method='custom_bins', labels=labels, bins=bins, fill_na_value='Missing in Items')
)

**Create a new review score dimension**

Categories:

- Positive: 4-5
- Neutral: 3
- Negative: 1-2

In [None]:
# Sentiment from the (floored) average review score: 4-5 positive, 3 neutral,
# 1-2 negative.
# NOTE(review): orders without a review presumably receive "Missing Score" via the
# default rule -- confirm against preproc.to_categorical.
rules = {
    "Positive": lambda x: x.isin([4, 5]),
    "Neutral": lambda x: x == 3,
    'Negative': lambda x: x.isin([1, 2]),
    "Missing Score": "default"
}
df_orders['order_review_sentiment'] = df_orders.order_avg_reviews_score.preproc.to_categorical(rules=rules)

Let’s look at the missing values.

In [None]:
# Anomaly report for the enriched orders table (used here to inspect missing values).
df_orders.explore.detect_anomalies()

- Missing values in time-related variables occur due to absent dates (normal for undelivered orders)
- Missing values in product-related variables occur because some orders aren't in the items table (canceled/unavailable). These won't affect sales analysis.

For dimensions, replace missing values with 'No Order in Items'.

In [None]:
# For each categorical dimension, register the placeholder as a valid category first
# (filling with an unknown category is not allowed), then use it for missing values.
for _dimension in (
    'order_product_categories',
    'order_general_product_categories',
    'order_is_free_shipping',
):
    df_orders[_dimension] = (
        df_orders[_dimension]
        .cat.add_categories('No Order in Items')
        .fillna('No Order in Items')
    )

### 3.8.3 Creating Table df_sales

The orders table contains order creation time, payment approval time, and order status. After examining statuses:

- **created**: Few orders, old, comments indicate non-delivery
- **approved**: Few orders, old, no comments
- **processing**: >90% have 1-2 star reviews, most undelivered (some reviews mention stockouts)
- **invoiced**: >80% have 1-2 stars, mostly undelivered (some mention stockouts)
- **shipped**: >70% have 1-2 stars, mostly undelivered

Let's look at how the total delivery time is distributed.

In [None]:
# Histogram of the 'delivery_time_estimated_days' column.
# NOTE(review): the axis label and title describe it as delivery time to the customer --
# confirm this column (rather than delivery_time_days) is the intended one.
df_orders.viz.histogram(
    x='delivery_time_estimated_days'
    , labels={'delivery_time_estimated_days': 'Delivery time to customer, days'}
    , title='Distribution of delivery time to customer'
)

Delivery typically takes ≤1 month. We'll use this threshold to determine delivery status.

Let's look at how the statuses are distributed for orders that have passed a month from the estimated delivery date.

In [None]:
# Latest purchase timestamp in the data; used below as the reference "now".
tmp_last_date = df_orders.order_purchase_dt.max()

In [None]:
# Callable row filter: orders whose estimated delivery date lies more than 31 days
# before the reference date and whose status is not 'Delivered'.
tmp_mask = lambda x: ((tmp_last_date - x.order_estimated_delivery_dt).dt.days > 31) & (x.order_status != 'Delivered')

In [None]:
# Status frequencies among the overdue, undelivered orders selected by the filter.
df_orders[tmp_mask].order_status.value_counts()

**Key Observations:**

- Since delivery usually takes ≤1 month, orders exceeding this are considered undelivered (confirmed by review content mentioning non-delivery/stockouts).

We'll define a purchase as:

- Orders without 'canceled'/'unavailable' status
- Orders with 'delivered' status
- Orders without 'delivered' status where no more than 31 days have passed since the estimated delivery date (measured from the latest purchase date in the data)

In [None]:
# Purchase definition: the order is delivered, OR at most 31 days have passed between
# its estimated delivery date and the reference date and it is neither canceled nor
# unavailable.
tmp_mask = (
    (df_orders.order_status == 'Delivered') |
    (
        ((tmp_last_date - df_orders.order_estimated_delivery_dt).dt.days <= 31) & 
        (~df_orders.order_status.isin(['Canceled', 'Unavailable'])) 
    )
)

In [None]:
# Sales table: the orders that satisfy the purchase definition.
df_sales = df_orders[tmp_mask]

Let's look at the count by statuses.

In [None]:
# Status frequencies across all orders in df_orders.
df_orders.order_status.value_counts()

We'll create an order-level variable indicating purchase conversion.

In [None]:
# Label every order by whether it satisfies the purchase definition. This column is
# added after df_sales was created, so df_sales does not contain it.
df_orders['is_purchase'] = tmp_mask.map({True: 'Purchase', False: 'Not Purchase'}).astype('category')

In [None]:
# Label frequencies, including missing values if any.
df_orders.is_purchase.value_counts(dropna=False)

**Create first purchase flag**

To handle multiple purchases within the same timestamp, use ranking.

In [None]:
# Order each customer's sales chronologically so that the position within the customer
# group can serve as the purchase rank.
df_sales = df_sales.sort_values(['customer_unique_id', 'order_purchase_dt'])

# Earliest purchase timestamp of the customer, repeated on each of the customer's rows.
df_sales['customer_first_purchase_dt'] = (
    df_sales.groupby('customer_unique_id')['order_purchase_dt'].transform('min')
)

# 0-based position of the sale within the customer's sorted sales. It breaks ties when
# several sales share the earliest timestamp, so exactly one of them is flagged.
purchase_rank = df_sales.groupby('customer_unique_id').cumcount()
is_earliest_dt = df_sales['order_purchase_dt'] == df_sales['customer_first_purchase_dt']
df_sales['sale_is_customer_first_purchase'] = is_earliest_dt & (purchase_rank == 0)

Let’s look at the missing values.

In [None]:
# Anomaly report for the sales table (used here to inspect missing values).
df_sales.explore.detect_anomalies()

Results are as expected.

### 3.8.4 Enriching Table df_customers

customer_unique_id isn't unique in df_customers (data characteristic). For analysis, we'll:

- Save original table under new name
- Remove duplicates from df_customers for user analysis

In [None]:
# Keep the original customers table (one row per customer_id) before deduplicating.
df_customers_origin = df_customers.copy()

In [None]:
# One row per unique customer (the first occurrence is kept); customer_id is dropped
# because it no longer identifies the row.
df_customers = (
    df_customers
    .drop_duplicates('customer_unique_id')
    .drop(columns='customer_id')
)

#### 3.8.4.1 From Table  df_orders

Create metrics for each customer:

- Order count
- Canceled order count
- Cancelation rate

In [None]:
# Work on a copy and derive boolean helper flags so that shares can be computed as means.
tmp_df_orders_agg = df_orders.copy()
for _flag, _flag_values in (
    ('is_canceled', tmp_df_orders_agg['order_status'] == 'Canceled'),
    ('is_not_delivered', tmp_df_orders_agg['order_status'] != 'Delivered'),
    ('is_customer_issue', tmp_df_orders_agg['delivery_issue_reason'] == 'Customer Issue'),
    ('is_service_issue', tmp_df_orders_agg['delivery_issue_reason'] == 'Service Issue'),
):
    tmp_df_orders_agg[_flag] = _flag_values

# One row per unique customer: order count plus counts/shares of problematic orders.
tmp_df_orders_agg = tmp_df_orders_agg.groupby('customer_unique_id', as_index=False).agg(
    orders_cnt=('order_id', 'nunique'),
    canceled_share=('is_canceled', 'mean'),
    canceled_orders_cnt=('is_canceled', 'sum'),
    not_delivered_share=('is_not_delivered', 'mean'),
    customer_issue_share=('is_customer_issue', 'mean'),
    service_issue_share=('is_service_issue', 'mean'),
)

Check key mismatches.

In [None]:
fron.analyze_join_keys(df_orders, tmp_df_reviews_agg, "order_id", how='left')

Merge tables.

In [None]:
# Add the per-customer order aggregates to the deduplicated customers table.
df_customers = df_customers.merge(tmp_df_orders_agg, on='customer_unique_id', how='left')

#### 3.8.4.2 From Table  df_sales

Create metrics for each customer:

- Purchase count
- Average delivery time
- Average delivery delay
- Delayed order rate
- Weekend purchase rate
- Average inter-purchase time
- Average payment count
- Total payments
- Average order value
- Average single payment amount
- Installment order rate
- Average product count per order
- Average unique product count
- Average seller count per order
- Total product value
- Average product value
- Average single product value
- Average shipping cost
- Average order weight
- Average order volume
- Free shipping rate
- Review count
- Average review score
- Active months count
- Max consecutive active months

Create dimensions for each customer:

- Installment usage
- Payment types
- Top 3 purchase weekdays
- Top 3 product categories
- Top 3 generalized categories
- First purchase date

**Top 3 Weekdays**

In [None]:
# Distribution of the number of distinct purchase weekdays per customer.
df_sales.groupby('customer_unique_id')['purchase_weekday'].nunique().value_counts()

Aggregate top 3 weekdays by purchase frequency.

In [None]:
# Count sales per (customer, weekday); observed=True skips weekday categories that do
# not occur for a customer.
top_purchase_weekdays = df_sales.groupby(
    ['customer_unique_id', 'purchase_weekday'], as_index=False, observed=True
).size()
# Order each customer's weekdays from most to least frequent and keep the first three.
top_purchase_weekdays = top_purchase_weekdays.sort_values(
    by=['customer_unique_id', 'size'], ascending=[True, False]
)
top_purchase_weekdays = top_purchase_weekdays.groupby('customer_unique_id').head(3)
# Collapse the kept weekdays into one alphabetically sorted, comma-separated string.
top_purchase_weekdays = top_purchase_weekdays.groupby('customer_unique_id', as_index=False).agg(
    customer_top_purchase_weekdays=('purchase_weekday', lambda days: ', '.join(sorted(set(days)))),
)

**Payment Types**

In [None]:
# Sales joined with their individual payments (one row per payment), followed by the
# distribution of the number of distinct payment types per customer.
payment_types = (
    df_sales.merge(df_payments, on='order_id', how='left')
)
payment_types.groupby('customer_unique_id')['payment_type'].nunique().value_counts()

Concatenate into string.

In [None]:
# All distinct payment types of a customer as one alphabetically sorted,
# comma-separated string (every type is kept, not just the top three).
# NOTE(review): payment_types comes from a left merge, so a sale without any payment
# row would carry a missing payment_type, and str.join would raise on it -- confirm
# every sale has at least one payment.
payment_types = (
    payment_types.groupby('customer_unique_id', as_index=False)
    .agg(
        customer_payment_types = ('payment_type', lambda x: ', '.join(sorted(set(x))))
    )
)

**Top 3 Product Categories:**

In [None]:
# Sales joined with their items and the items' products (one row per item).
top_product_categories = (
    df_sales.merge(df_items, on='order_id', how='left')
    .merge(df_products, on='product_id', how='left')
)
# Distribution of the number of distinct product categories per customer; a category is
# first counted once per order.
(top_product_categories.drop_duplicates(['customer_unique_id', 'order_id', 'product_category'])
.groupby(['customer_unique_id'])['product_category']
.nunique()
.value_counts()
)
Aggregate top 3 categories by frequency.

In [None]:
# Count each category once per order, then count orders per (customer, category).
top_product_categories = top_product_categories.drop_duplicates(
    ['customer_unique_id', 'order_id', 'product_category']
)
top_product_categories = top_product_categories.groupby(
    ['customer_unique_id', 'product_category'], as_index=False, observed=True
).size()
# Order each customer's categories from most to least frequent and keep the first three.
top_product_categories = top_product_categories.sort_values(
    by=['customer_unique_id', 'size'], ascending=[True, False]
)
top_product_categories = top_product_categories.groupby('customer_unique_id').head(3)
# Collapse the kept categories into one alphabetically sorted, comma-separated string.
top_product_categories = top_product_categories.groupby('customer_unique_id', as_index=False).agg(
    customer_top_product_categories=('product_category', lambda cats: ', '.join(sorted(set(cats)))),
)

**Top 3 Generalized Categories**

In [None]:
# Sales joined with their items and the items' products (one row per item).
top_general_product_categories = (
    df_sales.merge(df_items, on='order_id', how='left')
    .merge(df_products, on='product_id', how='left')
)
# Distribution of the number of distinct generalized categories per customer; a
# category is first counted once per order.
(top_general_product_categories.drop_duplicates(['customer_unique_id', 'order_id', 'general_product_category'])
.groupby(['customer_unique_id'])['general_product_category']
.nunique()
.value_counts()
)
Aggregate top 3 categories by frequency.

In [None]:
# Count each generalized category once per order, then count orders per
# (customer, generalized category).
top_general_product_categories = top_general_product_categories.drop_duplicates(
    ['customer_unique_id', 'order_id', 'general_product_category']
)
top_general_product_categories = top_general_product_categories.groupby(
    ['customer_unique_id', 'general_product_category'], as_index=False, observed=True
).size()
# Order each customer's categories from most to least frequent and keep the first three.
top_general_product_categories = top_general_product_categories.sort_values(
    by=['customer_unique_id', 'size'], ascending=[True, False]
)
top_general_product_categories = top_general_product_categories.groupby('customer_unique_id').head(3)
# Collapse the kept categories into one alphabetically sorted, comma-separated string.
top_general_product_categories = top_general_product_categories.groupby(
    'customer_unique_id', as_index=False
).agg(
    customer_top_general_product_categories=(
        'general_product_category',
        lambda cats: ', '.join(sorted(set(cats))),
    ),
)

In [None]:
# Work on a copy and turn labelled dimensions into booleans so that shares can be
# computed as means. Each entry is (target column, source column, label meaning True).
tmp_df_sales_agg = df_sales.copy()
for _target, _source, _label in (
    ('is_delayed', 'is_delayed', 'Delayed'),
    ('is_purchase_weekend', 'purchase_day_type', 'Weekend'),
    ('order_has_installment', 'order_has_installment', 'Has Installments'),
    ('order_is_free_shipping', 'order_is_free_shipping', 'Free Shipping'),
):
    tmp_df_sales_agg[_target] = tmp_df_sales_agg[_source] == _label

In [None]:
# Collapse sales to one row per unique customer.
tmp_df_sales_agg = tmp_df_sales_agg.groupby('customer_unique_id', as_index=False).agg(
    buys_cnt=('order_id', 'nunique'),
    # delivery
    avg_delivery_time_days=('delivery_time_days', 'mean'),
    avg_delivery_delay_days=('delivery_delay_days', 'mean'),
    delayed_orders_share=('is_delayed', 'mean'),
    purchase_weekend_share=('is_purchase_weekend', 'mean'),
    # holds the share of FIRST purchases at this point; inverted in a later step
    repeat_purchase_share=('sale_is_customer_first_purchase', 'mean'),
    # payments
    avg_payments_cnt=('payments_cnt', 'mean'),
    total_customer_payment=('total_payment', 'sum'),
    avg_total_order_payment=('total_payment', 'mean'),
    avg_individual_payment=('avg_payment', 'mean'),
    installment_orders_share=('order_has_installment', 'mean'),
    # products
    avg_products_cnt=('products_cnt', 'mean'),
    avg_unique_products_cnt=('unique_products_cnt', 'mean'),
    avg_sellers_cnt=('sellers_cnt', 'mean'),
    avg_order_total_products_price=('total_products_price', 'mean'),
    avg_total_order_price=('total_order_price', 'mean'),
    avg_products_price=('avg_products_price', 'mean'),
    total_products_price=('total_products_price', 'sum'),
    # shipping
    avg_order_total_freight_value=('total_freight_value', 'mean'),
    avg_order_total_weight_kg=('total_weight_kg', 'mean'),
    avg_order_total_volume_cm3=('total_volume_cm3', 'mean'),
    free_shipping_share=('order_is_free_shipping', 'mean'),
    # reviews
    reviews_cnt=('reviews_cnt', 'sum'),
    customer_avg_reviews_score=('order_avg_reviews_score', 'mean'),
    avg_distance_km=('avg_distance_km', 'mean'),
    # purchase timeline
    first_purchase_dt=('order_purchase_dt', 'min'),
    last_purchase_dt=('order_purchase_dt', 'max'),
    # days from the first to the second purchase; NaN for single-purchase customers
    from_first_to_second_days=(
        'order_purchase_dt',
        lambda dts: (dts.nsmallest(2).iloc[-1] - dts.min()).total_seconds() / (3600*24) if len(dts) > 1 else np.nan,
    ),
    # days from the first to the last purchase; NaN for single-purchase customers
    from_first_to_last_days=(
        'order_purchase_dt',
        lambda dts: (dts.max() - dts.min()).total_seconds() / (3600*24) if len(dts) > 1 else np.nan,
    ),
)
# Attach the string dimensions prepared in the previous cells.
for _dimension_frame in (
    top_purchase_weekdays,
    payment_types,
    top_product_categories,
    top_general_product_categories,
):
    tmp_df_sales_agg = tmp_df_sales_agg.merge(_dimension_frame, on='customer_unique_id', how='left')

We calculated the average value of the column 'sale_is_customer_first_purchase', but to get the proportion of repeat purchases, we need to subtract this value from 1.


In [None]:
tmp_df_sales_agg['repeat_purchase_share'] = 1 - tmp_df_sales_agg['repeat_purchase_share']

Customers whose orders are absent from df_items have missing values in the product-category dimensions; we replace them with 'No Order in Items'.

In [None]:
# Replace NaN in both product-category dimensions with one explicit placeholder.
tmp_df_sales_agg = tmp_df_sales_agg.fillna({
    'customer_top_product_categories': 'No Order in Items',
    'customer_top_general_product_categories': 'No Order in Items',
})

In [None]:
# Store the four per-customer text dimensions as pandas categoricals.
tmp_df_sales_agg = tmp_df_sales_agg.astype({
    'customer_top_purchase_weekdays': 'category',
    'customer_payment_types': 'category',
    'customer_top_product_categories': 'category',
    'customer_top_general_product_categories': 'category',
})

**Inter-purchase Time**

In [None]:
# Average gap, in fractional days, between consecutive purchases of each customer.
tmp_df_diff_days = (
    df_sales[['customer_unique_id', 'order_purchase_dt']]
    .sort_values(['customer_unique_id', 'order_purchase_dt'])
)
# diff() within each customer yields the time since that customer's previous
# purchase (NaT for the first one). The conversion to days is vectorised and
# uses total_seconds(), matching the from_first_to_*_days metrics computed above.
tmp_df_diff_days['avg_buys_diff_days'] = (
    tmp_df_diff_days.groupby('customer_unique_id')['order_purchase_dt']
    .diff()
    .dt.total_seconds() / (24 * 3600)
)
# Each customer's first purchase has no predecessor, so it carries no gap.
tmp_df_diff_days = tmp_df_diff_days.dropna(subset=['avg_buys_diff_days'])
# Customers with a single purchase have no rows left and are absent from the result.
tmp_df_diff_days = (
    tmp_df_diff_days.groupby('customer_unique_id', as_index=False)
    .agg(avg_buys_diff_days=('avg_buys_diff_days', 'mean'))
)

Check key mismatches.

In [None]:
fron.analyze_join_keys(tmp_df_sales_agg, tmp_df_diff_days, "customer_unique_id", how='left')

NA for single-purchase customers (expected).

Merge tables.

In [None]:
tmp_df_sales_agg = tmp_df_sales_agg.merge(tmp_df_diff_days, on='customer_unique_id', how='left')

**Active Months**

In [None]:
# Number of distinct calendar months in which each customer made a purchase.
tmp_df_months_with_buys = (
    df_sales['order_purchase_dt'].dt.to_period('M')
    .groupby(df_sales['customer_unique_id'])
    .nunique()
    .rename('months_with_buys')
    .reset_index()
)

In [None]:
tmp_df_months_with_buys.months_with_buys.value_counts(dropna=False)

Check key mismatches.

In [None]:
fron.analyze_join_keys(tmp_df_sales_agg, tmp_df_months_with_buys, "customer_unique_id", how='left')

Merge tables.

In [None]:
tmp_df_sales_agg = tmp_df_sales_agg.merge(tmp_df_months_with_buys, on='customer_unique_id', how='left')

**Consecutive Active Months**

Count streaks of ≥2 months.

In [None]:
# One row per (customer, calendar month) in which the customer purchased.
# The month column is added with assign() on a new frame: writing a column
# onto a column slice of df_sales is chained assignment and triggers
# SettingWithCopyWarning.
tmp_df_max_months_with_buys = (
    df_sales[['order_purchase_dt', 'customer_unique_id', 'order_id']]
    .assign(year_month=lambda x: x.order_purchase_dt.dt.to_period('M'))
    .drop_duplicates(['customer_unique_id', 'year_month'])
)

In [None]:
# Order each customer's active months chronologically so that diff() compares adjacent active months.
tmp_df_max_months_with_buys = tmp_df_max_months_with_buys.sort_values(['customer_unique_id', 'year_month'])
# Gap between an active month and the customer's previous active month (missing for the first one).
tmp_df_max_months_with_buys['diff'] = tmp_df_max_months_with_buys.groupby('customer_unique_id')['year_month'].diff()
# Rows without a predecessor carry no gap; customers active in a single month drop out entirely here.
tmp_df_max_months_with_buys.dropna(subset='diff', inplace=True)
# NOTE(review): fills NaN in any remaining column with 0 — presumably a no-op safeguard; confirm it is needed.
tmp_df_max_months_with_buys.fillna(0, inplace=True)
# True where the gap to the previous active month equals exactly one month.
tmp_df_max_months_with_buys['is_diff_one_month'] = tmp_df_max_months_with_buys['diff'] == pd.offsets.MonthEnd(1)

In [None]:
def get_max_consecutive_repeats_plus_on(x):
    """Return the length of the longest run of consecutive True values in ``x``, plus one.

    ``x`` is a boolean Series. If it contains no True value the result is NaN.
    """
    # A new run starts wherever a value differs from its predecessor; the
    # running sum of those change points gives every run its own label.
    run_labels = (x != x.shift()).cumsum()
    # Keep the labels of the True positions only and measure each run's length.
    true_run_sizes = run_labels[x].value_counts()
    return true_run_sizes.max() + 1

In [None]:
# Longest streak of consecutive active months per customer.
tmp_df_max_months_with_buys = (
    tmp_df_max_months_with_buys.groupby('customer_unique_id')
    # The helper returns (longest run of one-month gaps) + 1, i.e. the number of months in the streak.
    .agg(max_consecutive_months_with_buys = ('is_diff_one_month', get_max_consecutive_repeats_plus_on))
    # Customers for whom the helper produced NaN (no one-month gap at all) are removed here.
    .dropna()
    .astype(int)
    .reset_index()
)

Check key mismatches.

In [None]:
fron.analyze_join_keys(tmp_df_sales_agg, tmp_df_max_months_with_buys, "customer_unique_id", how='left')

We have many users who made purchases in only one month, so they have a missing value in the new field. We will replace it with 1 after merging.


Merge tables.

In [None]:
tmp_df_sales_agg = tmp_df_sales_agg.merge(tmp_df_max_months_with_buys, on='customer_unique_id', how='left')

Replace NA with 1.

In [None]:
# Customers missing from the streak table never bought in two adjacent months,
# so their longest streak is a single month.
tmp_df_sales_agg['max_consecutive_months_with_buys'] = (
    tmp_df_sales_agg['max_consecutive_months_with_buys'].fillna(1).astype(int)
)
tmp_df_sales_agg['max_consecutive_months_with_buys'].value_counts(dropna=False)

**Merge with df_customers**

Check key mismatches.

In [None]:
fron.analyze_join_keys(df_customers, tmp_df_sales_agg, "customer_unique_id", how='left')

Missing values occur for customers with only canceled orders.

Merge tables.

In [None]:
df_customers = df_customers.merge(tmp_df_sales_agg, on='customer_unique_id', how='left')

#### 3.8.4.3 From Table  df_geolocations

Check key mismatches.

In [None]:
fron.analyze_join_keys(df_customers, df_geolocations, left_on='customer_zip_code_prefix', right_on='geolocation_zip_code_prefix', how='left')

As we already found out, there are customers whose prefixes are not present in the geolocation table. 

We need full prefixes only to calculate the distance between the customer and the seller. 

For geo-analysis, we will use the truncated prefixes. Let's check if there are any missing rows.


In [None]:
fron.analyze_join_keys(df_customers, df_geolocations, left_on='customer_zip_code_prefix_3_digits', right_on='geolocation_zip_code_prefix_3_digits', how='left')

With geo-analysis, there will be no problems. The issue is only with calculating the distance. 

We could try to fill in the coordinates based on the truncated prefixes and calculate the distance from them. 

However, this will clearly result in significant error, and we want to preserve accuracy. 

If we fill in these missing values, we will introduce more distortion into the data than gain any benefit. 

Given that there are few missing rows, we will simply leave them as is and analyze the distance without them.


Merge tables.

In [None]:
# Attach coordinates by full zip prefix and give them customer-specific names.
# The left join keeps customers whose prefix is missing from df_geolocations.
df_customers = (
    df_customers
    .merge(
        df_geolocations,
        left_on='customer_zip_code_prefix',
        right_on='geolocation_zip_code_prefix',
        how='left',
    )
    .rename(columns={'geolocation_lat': 'lat_customer', 'geolocation_lng': 'lng_customer'})
)

In [None]:
# The join keys brought in from df_geolocations duplicate the customer's own prefix columns.
df_customers = df_customers.drop(
    columns=['geolocation_zip_code_prefix', 'geolocation_zip_code_prefix_3_digits']
)

Let’s look at the missing values.

In [None]:
df_customers.explore.detect_anomalies()

All missing values are expected. 

Missing values are for customers who did not make any successful purchases. 

We will replace the missing values in the categorical dimensions with 'Never Converted'.


In [None]:
# A categorical column only accepts declared categories, so the placeholder
# is registered as a category before it is used to fill the gaps.
for _col in ('customer_top_purchase_weekdays', 'customer_payment_types'):
    _with_placeholder = df_customers[_col].cat.add_categories('Never Converted')
    df_customers[_col] = _with_placeholder.fillna('Never Converted')

In [None]:
# A categorical column only accepts declared categories, so the placeholder
# is registered as a category before it is used to fill the gaps.
for _col in ('customer_top_product_categories', 'customer_top_general_product_categories'):
    _with_placeholder = df_customers[_col].cat.add_categories('Never Converted')
    df_customers[_col] = _with_placeholder.fillna('Never Converted')

#### 3.8.4.4 Customer Segmentation

Based on customer metrics, we will segment the customers.

**Monetary Value**

Examine quantiles in the total_customer_payment column.

In [None]:
df_customers.total_customer_payment.quantile([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1])

We create the following segments:

- Low value: payments up to 63 R$ inclusive
- Medium value: payments from 63 to 182 R$ inclusive
- High value: payments above 182 R$

In [None]:
labels = ['Low', 'Medium', 'High']
bins = [-np.inf, 63, 182, np.inf]

In [None]:
df_customers['value_segment'] = (
    df_customers.total_customer_payment.preproc
    .to_categorical(method='custom_bins', labels=labels, bins=bins, fill_na_value='Never Converted')
)

---

**Activity**

We identify the following segments:

- Non-converted: customers who didn't complete any successful purchases
- Core customers:
    - Completed 3 or more purchases
    - Time between first and last purchase is 60 days or more
- Potential core:
    - Completed 2 or more purchases
    - Time between first and last purchase is 30 days or more (and the customer does not qualify as a core customer)
- Short-term repeaters:
    - Completed 2 or more purchases
    - Time between first and last purchase is less than 30 days
- One-time buyers: completed only one purchase

In [None]:
now = df_sales.order_purchase_dt.max()

In [None]:
# Rules are evaluated in order and np.select keeps the FIRST match per row,
# so the more specific segments must come before the broader ones.
conditions = [
    df_customers['first_purchase_dt'].isna() # No Successful Purchases
    , (df_customers['buys_cnt'] >= 3) & (df_customers['from_first_to_last_days'] >= 60) # Core
    , (df_customers['buys_cnt'] >= 2) & (df_customers['from_first_to_last_days'] >= 30) # Potential Core
    , (df_customers['buys_cnt'] >= 2) & (df_customers['from_first_to_last_days'] < 30) # Short-Lived Repeat
    , (df_customers['buys_cnt'] == 1) # One-Time
]
# Labels are positionally paired with the conditions above.
choices = ['Never Converted', 'Core', 'Potential Core', 'Short-Lived Repeat', 'One Time']
# Rows matching none of the rules are labelled 'Other'.
df_customers['activity_segment'] = np.select(conditions, choices, default='Other')
df_customers['activity_segment'] = df_customers['activity_segment'].astype('category')
# Display the segment sizes as a sanity check.
df_customers['activity_segment'].value_counts(dropna=False)

Examine quantiles in the avg_buys_diff_days column.

In [None]:
df_customers.avg_buys_diff_days.quantile([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1])

We create the following segments:

- Weekly 
- Monthly 
- Quarterly
- Semiannual 
- Annual 

In [None]:
# Bucket the average gap between purchases (days) using the edges 7 / 30 / 90 / 180.
# NOTE(review): which side of each edge is inclusive depends on to_categorical's default — confirm.
labels = ['Weekly', 'Monthly', 'Quarterly', 'Semiannual', 'Annual']
bins = [-np.inf, 7, 30, 90, 180, np.inf]
# No fill_na_value is passed here; missing values are labelled in the next cell.
df_customers['purchase_freq_segment'] = (
    df_customers.avg_buys_diff_days.preproc
    .to_categorical(method='custom_bins', labels=labels, bins=bins)
)

Missing values require a two-step replacement because they occur for two different groups: customers with a single purchase (who have no interval between purchases) and customers who never completed a purchase.

In [None]:
# Two distinct reasons for a missing segment; both masks are reused for repeat_segment further down.
mask_one_time = df_customers['buys_cnt'] == 1
mask_never_converted = df_customers['buys_cnt'].isna()

# The new labels must be registered as categories before they can be assigned.
df_customers['purchase_freq_segment'] = (
    df_customers['purchase_freq_segment'].cat.add_categories(['Non-Repeating', 'Never Converted'])
)

df_customers.loc[mask_one_time, 'purchase_freq_segment'] = 'Non-Repeating'
df_customers.loc[mask_never_converted, 'purchase_freq_segment'] = 'Never Converted'
# Display the resulting segment sizes.
df_customers['purchase_freq_segment'].value_counts()

Examine quantiles in the from_first_to_second_days column.

In [None]:
df_customers.from_first_to_second_days.quantile([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1])

We create the following segments:

- Fast repeaters: repurchase within 14 days
- Medium repeaters: repurchase in 14-60 days
- Slow repeaters: repurchase after 60 days

In [None]:
labels = ['Fast Repeat', 'Medium Repeat', 'Slow Repeat']
bins = [-np.inf, 14, 60, np.inf]

In [None]:
df_customers['repeat_segment'] = (
    df_customers.from_first_to_second_days.preproc
    .to_categorical(method='custom_bins', labels=labels, bins=bins)
)

We replace missing values.

In [None]:
df_customers['repeat_segment'] = (
    df_customers['repeat_segment'].cat.add_categories(['Non-Repeating', 'Never Converted'])
)

df_customers.loc[mask_one_time, 'repeat_segment'] = 'Non-Repeating'
df_customers.loc[mask_never_converted, 'repeat_segment'] = 'Never Converted'
df_customers['repeat_segment'].value_counts()

---

**Loyalty**

Examine quantiles in the customer_avg_reviews_score column.

In [None]:
df_customers.customer_avg_reviews_score.quantile([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1])

We create the following segments:

- Critics: average score below 3
- Neutral: average score from 3 to below 5
- Promoters: average score of 5

In [None]:
# Bucket the customer's average review score using the edges 3 and 5.
labels = ['Critic', 'Neutral', 'Promoter']
bins = [-np.inf, 3, 5, np.inf]
# NOTE(review): right=False presumably makes the bins left-closed, so a score of
# exactly 3 is 'Neutral' and exactly 5 is 'Promoter' — confirm against to_categorical.
df_customers['loyalty_segment'] = (
    df_customers.customer_avg_reviews_score.preproc
    .to_categorical(method='custom_bins', labels=labels, bins=bins, fill_na_value='Never Converted', right=False)
)

---

**Risk Segment**

Examine quantiles in the canceled_share column.

In [None]:
df_customers.canceled_share.quantile([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1])

We create the following segments:

- Reliable: 0% cancellation rate
- Risky: cancellation rate above 0%

In [None]:
# Split customers by cancellation share with a single edge at 0.
labels = ['Reliable', 'Risky']
bins = [-np.inf, 0, np.inf]
# NOTE(review): unlike most other segments, no fill_na_value is passed here;
# presumably canceled_share has no missing values — confirm.
df_customers['risk_segment'] = (
    df_customers.canceled_share.preproc
    .to_categorical(method='custom_bins', labels=labels, bins=bins)
)

---

**Customer Behavioral Characteristics**

Examine quantiles in the purchase_weekend_share column.

In [None]:
df_customers.purchase_weekend_share.quantile([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1])

We create the following segments:

- Weekday buyers: 0% weekend purchases
- Weekend buyers: weekend purchases above 0%

In [None]:
labels = ['Weekday', 'Weekend']
bins = [-np.inf, 0, np.inf]
df_customers['weekday_segment'] = (
    df_customers.purchase_weekend_share.preproc
    .to_categorical(method='custom_bins', labels=labels, bins=bins, fill_na_value='Never Converted')
)

Examine quantiles in the installment_orders_share column.

In [None]:
df_customers.installment_orders_share.quantile([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1])

We create the following segments:

- Full payment: 0% installment orders
- Installment users: installment orders above 0%

In [None]:
labels = ['Full Pay', 'Installment']
bins = [-np.inf, 0, np.inf]
df_customers['installment_segment'] = (
    df_customers.installment_orders_share.preproc
    .to_categorical(method='custom_bins', labels=labels, bins=bins, fill_na_value='Never Converted')
)

---

**Order Characteristics**

Examine quantiles in the avg_products_cnt column.

In [None]:
df_customers.avg_products_cnt.quantile([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1])

We create the following segments:

- Single product buyers: average 1 product per order
- Multi-product buyers: average 1-2 products per order
- Bulk buyers: average more than 2 products per order

In [None]:
labels = ['Single Product', 'Multi Product', 'Bulk Buyer']
bins = [-np.inf, 1, 2, np.inf]
df_customers['products_cnt_segment'] = (
    df_customers.avg_products_cnt.preproc
    .to_categorical(method='custom_bins', labels=labels, bins=bins, fill_na_value='Never Converted')
)

Examine quantiles in the avg_order_total_weight_kg column.

In [None]:
df_customers.avg_order_total_weight_kg.quantile([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1])

We create the following segments:

- Light orders: average up to 0.5 kg
- Medium orders: average 0.5-2.5 kg
- Heavy orders: average more than 2.5 kg

In [None]:
labels = ['Light', 'Medium', 'Heavy']
bins = [-np.inf, 0.5, 2.5, np.inf]
df_customers['weight_segment'] = (
    df_customers.avg_order_total_weight_kg.preproc
    .to_categorical(method='custom_bins', labels=labels, bins=bins, fill_na_value='Never Converted')
)

### 3.8.5 Enriching Table df_products

#### 3.8.5.1 From Table  df_orders

We will create the following metrics for each product:

- Count of canceled orders containing this product

In [None]:
# For every product, count the distinct canceled orders that contain it.
tmp_df_products_canceled = (
    df_orders.loc[df_orders['order_status'] == 'Canceled', ['order_id']]
    .merge(df_items, on='order_id', how='left')
    .groupby('product_id', as_index=False)
    .agg(product_canceled_orders_cnt=('order_id', 'nunique'))
)

We will merge this with the main products dataframe.

Check key mismatches.

In [None]:
fron.analyze_join_keys(df_products, tmp_df_products_canceled, "product_id", how='left')

Missing values will appear for products that were not part of canceled orders.

Merge tables.

In [None]:
df_products = df_products.merge(tmp_df_products_canceled, on='product_id', how='left')

#### 3.8.5.2 From Table  df_sales

We will create the following additional metrics for each product:

- Count of orders containing this product
- Total units sold of this product
- Total sales value for this product
- Average quantity of this product per order
- Average share of this product in order by quantity
- Average share of this product in order by value
- Average price of product over all time
- Maximum price of product
- Minimum price of product
- Price range

In [None]:
# Per-product sales metrics, restricted to items that belong to orders in df_sales.
tmp_df_products = (
    df_sales[['order_id']]
    .merge(df_items, on='order_id', how='left')
    .groupby('product_id', as_index=False)
    .agg(
        product_sales_cnt=('order_id', 'nunique'),
        total_units_sold=('order_item_id', 'count'),
        total_sales_amount=('price', 'sum'),
        avg_price=('price', 'mean'),
        min_price=('price', 'min'),
        max_price=('price', 'max'),
    )
    # Spread between the highest and the lowest price observed for the product.
    .assign(price_range=lambda d: d['max_price'] - d['min_price'])
)

We will calculate average shares in orders.

In [None]:
# Quantity and value of each product inside each sold order, together with
# the product's share of the order's totals.
tmp_df_products_share = (
    df_sales[['order_id']]
    .merge(df_items, on='order_id', how='left')
    .groupby(['order_id', 'product_id'], as_index=False)
    .agg(
        product_qty=('order_item_id', 'count'),
        product_total_price=('price', 'sum'),
    )
    .assign(
        # Order-level totals broadcast back onto every product row of the order.
        products_cnt=lambda d: d.groupby('order_id')['product_qty'].transform('sum'),
        order_total_price=lambda d: d.groupby('order_id')['product_total_price'].transform('sum'),
        product_qty_share_per_order=lambda d: d['product_qty'] / d['products_cnt'],
        order_total_price_share_per_order=lambda d: d['product_total_price'] / d['order_total_price'],
    )
)

In [None]:
# Average the per-order quantity and shares over all orders that contain the product.
tmp_df_products_share = (tmp_df_products_share.groupby('product_id', as_index=False)
                      .agg(
                          avg_product_qty_per_order = ('product_qty', 'mean')
                          , avg_product_qty_share_per_order = ('product_qty_share_per_order', 'mean')
                          , avg_order_total_price_share_per_order = ('order_total_price_share_per_order', 'mean')
                      )
)

We will combine the two intermediate product tables.

Check key mismatches.

In [None]:
fron.analyze_join_keys(tmp_df_products, tmp_df_products_share, "product_id", how='left')

Merge tables.

In [None]:
tmp_df_products = tmp_df_products.merge(tmp_df_products_share, on='product_id', how='left')

We will merge this with the main products dataframe.

Check key mismatches.

In [None]:
fron.analyze_join_keys(df_products, tmp_df_products, "product_id", how='left')

Missing values will appear for products that were never sold.

Merge tables.

In [None]:
df_products = df_products.merge(tmp_df_products, on='product_id', how='left')

Let’s look at the missing values.

In [None]:
df_products.explore.detect_anomalies()

All missing values are expected.

### 3.8.6 Enriching Table df_sellers

#### 3.8.6.1 From Tables df_items and df_products

From Table `df_items` we will select only sales.

Check key mismatches.

In [None]:
fron.analyze_join_keys(df_items, df_sales[['order_id']], "order_id", how='inner')

In [None]:
tmp_df_items = df_items.merge(df_sales[['order_id']], on='order_id', how='inner')

We will add product information.

Check key mismatches.

In [None]:
fron.analyze_join_keys(tmp_df_items, df_products, "product_id", how='left')

Merge tables.

In [None]:
tmp_df_items_prods = tmp_df_items.merge(df_products, on='product_id', how='left')

We will create the following metrics for each seller:

- Total products sold
- Count of unique products sold
- Number of orders
- Total sales value
- Average carrier handoff delay
- Average number of items per order
- Average order value
- Average item price
- Average product weight

In [None]:
# Seller-level totals computed directly over the sold item rows.
tmp_df_items_prods_agg_1 = (
    tmp_df_items_prods.groupby('seller_id', as_index=False)
    .agg(
        # 'count' counts item rows (units sold); 'nunique' counts distinct products.
        products_cnt = ('product_id', 'count')
        , unique_products_cnt = ('product_id', 'nunique')
        , orders_cnt = ('order_id', 'nunique')
        , revenue = ('price', 'sum')
        , avg_carrier_delivery_delay_days = ('carrier_delivery_delay_days', 'mean')
    )
)

In [None]:
# Two-stage aggregation: first summarise every (seller, order) pair, then
# average those per-order summaries for each seller.
tmp_df_items_prods_agg_2 = (
    tmp_df_items_prods.groupby(['seller_id', 'order_id'], as_index=False)
    .agg(
        products_cnt = ('product_id', 'count')
        , order_total_price = ('price', 'sum')
        , avg_product_price = ('price', 'mean')
    )
    .groupby('seller_id', as_index=False)
    .agg(
        # NOTE(review): 'avg_prouducts_cnt' looks like a typo for 'avg_products_cnt';
        # the name ends up as a df_sellers column, so check downstream usage before renaming.
        avg_prouducts_cnt = ('products_cnt', 'mean')
        , avg_order_total_price = ('order_total_price', 'mean')
        # Mean of per-order mean prices, i.e. every order weighs equally regardless of its size.
        , avg_product_price = ('avg_product_price', 'mean')
    )
)

Important note for average weight calculation:

- We must remove duplicates to calculate averages based on unique products only.

In [None]:
# Average weight over each seller's distinct products: duplicates are removed
# first so that a product sold many times is counted once.
tmp_df_items_prods_agg_3 = (
     tmp_df_items_prods.drop_duplicates(subset = ['seller_id', 'product_id'])
     .groupby('seller_id', as_index=False)
     .agg(
          # Still in grams at this point; the following cell divides by 1000 to get kilograms.
          avg_product_weight_kg = ('product_weight_g', 'mean')
     )
)

In [None]:
tmp_df_items_prods_agg_3['avg_product_weight_kg'] = (tmp_df_items_prods_agg_3['avg_product_weight_kg'] / 1000).round(2)

We will merge intermediate dataframes.

In [None]:
fron.analyze_join_keys(tmp_df_items_prods_agg_1, tmp_df_items_prods_agg_2, "seller_id", how='left')

In [None]:
tmp_df_items_prods_agg = tmp_df_items_prods_agg_1.merge(tmp_df_items_prods_agg_2, on='seller_id', how='left')

In [None]:
fron.analyze_join_keys(tmp_df_items_prods_agg, tmp_df_items_prods_agg_3, "seller_id", how='left')

In [None]:
tmp_df_items_prods_agg = tmp_df_items_prods_agg.merge(tmp_df_items_prods_agg_3, on='seller_id', how='left')

We will add new fields to df_sellers.

Check key mismatches.

In [None]:
fron.analyze_join_keys(df_sellers, tmp_df_items_prods_agg, "seller_id", how='left')

Sellers whose products were never purchased (or whose orders were canceled) will have missing values. This is expected.

Merge tables.

In [None]:
df_sellers = df_sellers.merge(tmp_df_items_prods_agg, on='seller_id', how='left')

#### 3.8.6.2 From Table  df_geolocations

Check key mismatches.

In [None]:
fron.analyze_join_keys(df_sellers, df_geolocations, left_on='seller_zip_code_prefix', right_on='geolocation_zip_code_prefix', how='left')

As previously established, some sellers' zip prefixes are missing from the geolocation table.

We will check using truncated prefixes.

In [None]:
fron.analyze_join_keys(df_sellers, df_geolocations, left_on='seller_zip_code_prefix_3_digits', right_on='geolocation_zip_code_prefix_3_digits', how='left')

We will handle this the same way as with customers:

- Keep as-is for geo-analysis (no missing data)
- Maintain precision for distance calculations (accept missing values)
- Use left join to preserve all sellers

In [None]:
# Attach coordinates by full zip prefix and give them seller-specific names.
# The left join keeps sellers whose prefix is missing from df_geolocations.
df_sellers = (
    df_sellers
    .merge(
        df_geolocations,
        left_on='seller_zip_code_prefix',
        right_on='geolocation_zip_code_prefix',
        how='left',
    )
    .rename(columns={'geolocation_lat': 'lat_seller', 'geolocation_lng': 'lng_seller'})
)

In [None]:
# The join keys brought in from df_geolocations duplicate the seller's own prefix columns.
df_sellers = df_sellers.drop(
    columns=['geolocation_zip_code_prefix', 'geolocation_zip_code_prefix_3_digits']
)

Let’s look at the missing values.

In [None]:
df_sellers.explore.detect_anomalies()

We will clear temporary variables from memory.

In [None]:
# Iterate over a snapshot of the names: the namespace is modified inside the loop.
for var_name in tuple(globals()):
    if not var_name.startswith('tmp_'):
        continue
    # Remove the temporary object from the global namespace.
    globals().pop(var_name)

<h2 id="3-9"> 3.9 Intermediate Conclusion</h2>

Key preprocessing steps completed:

- Replaced zero values in payment_installments with 1 (mode and median)
- Removed complete duplicate rows from df_geolocations
- Averaged coordinates by geolocation_zip_code_prefix
- Replaced missing product dimensions/weights with median values by product category
- Replaced missing photo counts with 1 (mode and median)
- Replaced missing product category names with 'unknown'
- Converted product dimensions/weights to integer type

New variables created:

- Time-related metrics:
    - Order processing time
    - Total delivery time
    - Carrier delivery time
    - Difference between actual and estimated delivery times
- Review response time
- Total product cost (including shipping)
- Product volume
- Weight-to-volume ratio
- Delivery time categories (Fast/Medium/Slow)
- Review character length
- Average review score per order

Data filtering:

- Trimmed order data to January 2017 - August 2018
- Created order cancellation flag
- Orders present in orders table but missing from items table were either canceled or unavailable

Data integration:

- Merged datasets and created analysis-ready dataframes
- Calculated buyer-seller distances

All missing values were expected and properly handled according to their business context.

<h1 id="4"> 4 Data Analysis</h1>

<h2 id="4-1"> 4.1 Creating Analysis Class</h2>

We'll establish labels and category ordering for our visualizations, then integrate these specifications into our dataframes.

In [None]:
# Human-readable axis/legend labels, keyed by dataframe column name.
base_labels = {
    # --- df_orders ---
    'year': 'Year',
    'purchase_year': 'Year',
    'purchase_month': 'Month',
    'purchase_season': 'Season',
    'purchase_weekday': 'Day of Week',
    'purchase_day_type': 'Day Type',
    'purchase_time_of_day': 'Time of Day',
    'purchase_hour': 'Hour',
    'order_status': 'Order Status',
    'is_delayed': 'Delivery Delay Status',
    'is_canceled': 'Order Cancellation Status',
    'is_delivered': 'Delivery Status',
    'delivery_time_days_cat': 'Delivery Time Category',
    'delivery_issue_reason': 'Delivery Issue Reason',
    'is_purchase': 'Purchase Status',
    # order-level columns derived from df_payments
    'order_has_installment': 'Installment Status',
    'order_total_payment_cat': 'Order Payment Category',
    'order_payment_types': 'Order Payment Types',
    # order-level columns derived from df_items and df_products
    'order_is_free_shipping': 'Free Shipping Status',
    'order_general_product_categories': 'General Product Categories',
    'order_product_categories': 'Product Categories',
    'order_total_weight_cat': 'Order Weight Category',
    'order_total_volume_cat': 'Order Volume Category',
    # order-level columns derived from df_reviews
    'order_avg_reviews_score': 'Order Review Score',
    'order_review_sentiment': 'Order Review Sentiment',
    # --- df_sales ---
    'sale_is_customer_first_purchase': 'First-Time Purchase',
    'sale_is_customer_first_purchase_month': 'First Purchase Month',
    # --- df_customers ---
    'customer_state': 'Customer State',
    'customer_city': 'Customer City',
    'customer_top_purchase_weekdays': 'Top Purchase Weekdays',
    'customer_payment_types': 'Payment Methods',
    'customer_top_product_categories': 'Top Product Categories',
    'customer_top_general_product_categories': 'Top General Product Categories',
    'activity_segment': 'Activity Segment',
    'value_segment': 'Value Segment',
    'purchase_freq_segment': 'Purchase Frequency Segment',
    'repeat_segment': 'Repeat Segment',
    'loyalty_segment': 'Loyalty Segment',
    'risk_segment': 'Risk Segment',
    'weekday_segment': 'Weekday Segment',
    'installment_segment': 'Installment Segment',
    'products_cnt_segment': 'Products Count Segment',
    'weight_segment': 'Weight Segment',
    # --- df_payments ---
    'has_installments': 'Installment Status',
    'payment_type': 'Payment Type',
    # --- df_products ---
    'general_product_category': 'General Product Category',
    'product_category': 'Product Category',
    # --- df_reviews ---
    'review_score': 'Review Score',
    'season_review': 'Season',
    'review_day_type': 'Day Type',
    'review_creation_weekday': 'Day of Week',
    # --- df_sellers ---
    'seller_state': 'Seller State',
    'seller_city': 'Seller City',
    # --- shared by all tables ---
    'day_of_month': 'Day of Month',
}

In [None]:
# Display order of the categories of each categorical column, keyed by column name.
# Values are lists of category labels in the order they should appear on plots.
base_category_orders = dict(
    is_purchase = ['Purchase', 'Not Purchase']
    , purchase_year = ['2017', '2018']
    , purchase_season = ['Spring', 'Summer', 'Autumn', 'Winter']
    , purchase_month = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
    , purchase_weekday = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    , purchase_day_type = ['Weekday', 'Weekend']
    , purchase_time_of_day = ['Morning', 'Afternoon', 'Evening', 'Night']
    # Hours run from 0 to 23.
    , purchase_hour = list(map(str, range(24)))
    , season_review = ['Spring', 'Summer', 'Autumn', 'Winter']
    , review_day_type = ['Weekday', 'Weekend']
    , review_creation_weekday = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    # Review scores run from 1 to 5.
    , order_avg_reviews_score = list(map(str, range(1, 6)))
    , review_score = list(map(str, range(1, 6)))
    , order_review_sentiment = ['Positive', 'Neutral', 'Negative']
    , is_delayed = ['Delayed', 'Not Delayed']
    , is_delivered = ['Delivered', 'Not Delivered']
    , order_status = ['Delivered', 'Shipped', 'Processing', 'Unavailable', 'Canceled', 'Invoiced', 'Approved', 'Created']
    , delivery_issue_reason = ['Service Issue', 'Customer Issue', 'No Issues']
    , delivery_time_days_cat = ['Fast', 'Medium', 'Long']
    , order_total_payment_cat = ['Cheap', 'Medium', 'Expensive']
    , order_total_weight_cat = ['Light', 'Medium', 'Heavy']
    , order_total_volume_cat = ['Small', 'Medium', 'Large']
    , order_has_installment = ['Has Installments', 'No Installments']
    , has_installments = ['Has Installments', 'No Installments']
    , order_is_free_shipping = ['Free Shipping', 'Paid Shipping']
    # Days of a month run from 1 to 31; range(31) would list a non-existent
    # day '0' and leave out day '31'.
    , day_of_month = list(map(str, range(1, 32)))
    # segments
    , value_segment = ['Low', 'Medium', 'High']
    , activity_segment = ['Core', 'Potential Core', 'Short-Lived Repeat', 'One Time', 'Never Converted']
    , purchase_freq_segment = ['Weekly', 'Monthly', 'Quarterly', 'Semiannual', 'Annual']
    , repeat_segment = ['Fast Repeat', 'Medium Repeat', 'Slow Repeat']
    , loyalty_segment = ['Critic', 'Neutral', 'Promoter']
    , risk_segment = ['Reliable', 'Risky']
    , weekday_segment = ['Weekday', 'Weekend']
    , installment_segment = ['Full Pay', 'Installment']
    , products_cnt_segment = ['Single Product', 'Multi Product', 'Bulk Buyer']
    , weight_segment = ['Light', 'Medium', 'Heavy']
)

In [None]:
# Register the shared labels and category orders on every analysis dataframe.
for df in (
    df_orders, df_sales, df_items, df_customers,
    df_products, df_sellers, df_reviews, df_payments,
):
    df.viz.update_plotly_settings(labels=base_labels, category_orders=base_category_orders)

We'll develop a dedicated class for generating standardized visualizations.

In [None]:
class PlotBuilder:
    """
    Class-level (global) configuration plus classmethod helpers that build
    standardized plots through the ``viz`` accessor of a DataFrame.

    All state lives on the class itself; use ``configure`` to set it and
    ``reset_configure`` to restore the defaults below.
    """
    # DataFrame used when a plot call does not pass ``data_frame``
    df = None
    # Name of the datetime column and its display label
    time_column = None
    time_column_label = None
    # Name of the metric column
    metric = None
    metric_label = None  # label for the metric on axes
    metric_label_for_distribution = None  # label used for box/histogram plots (e.g. when metric_label describes an aggregate)
    # Default aggregation function and time frequency
    agg_func = None
    freq = None
    # Optional base text for auto-generated titles
    title_base = None
    norm_by = None
    # Sequence of dimension names; integer x/y/color/cat1/cat2 arguments index into it
    cur_dim = None
    # Mapping of dimension column name -> display label
    dimensions = base_labels
    axis_sort_order = None
    text_auto = None
    # Extra keyword arguments (read by ``cat_compare`` and ``period_change``)
    plotly_kwargs = {}
    # Layout parameters applied to every produced figure
    update_fig = {}
    # While True, ``to_slide`` does not write any image
    block_save_fig_for_slides = True
    # Path prefix and image format used by ``to_slide``
    slide_path = 'for_slides/svg/'
    slide_img_fmt = 'svg'

    @classmethod
    def reset_configure(cls) -> None:
        """
        Restore the analysis settings to their defaults.

        The slide-export attributes (``block_save_fig_for_slides``,
        ``slide_path``, ``slide_img_fmt``) are left untouched.
        """
        cleared_attrs = (
            'df', 'time_column', 'time_column_label', 'metric', 'metric_label',
            'metric_label_for_distribution', 'agg_func', 'freq', 'title_base',
            'norm_by', 'cur_dim', 'axis_sort_order', 'text_auto',
        )
        for attr_name in cleared_attrs:
            setattr(cls, attr_name, None)
        cls.dimensions = base_labels
        # New dict objects on every reset, so earlier configurations are not shared.
        cls.plotly_kwargs = {}
        cls.update_fig = {}
        
    @classmethod
    def metric_info(cls, freq=None, agg_func=None, **kwargs):
        """
        Display distribution information and statistics for the metric column.

        Parameters:
        -----------
        freq : str, optional
            The time frequency for aggregation (e.g., 'ME' for month, 'W' for week, 'D' for day).
            If not provided, the analysis is performed on raw, non-aggregated data.
            Frequencies without a friendly name are used verbatim in the title
            and in the aggregated column name.
        agg_func : str or function, optional
            The aggregation function to apply when freq is specified (e.g., 'mean', 'sum', 'count').
            If not provided, uses the class's default agg_func.
        **kwargs : dict
            Additional arguments forwarded to ``explore.info``. ``labels`` and
            ``title`` are filled in automatically only when absent.

        Returns:
        --------
        Whatever ``explore.info`` returns for the (optionally aggregated) metric series.

        Raises:
        -------
        Any exception raised by ``get_metric_label_for_title_for_metric_info``
        when no metric label has been configured.
        """
        title_label = cls.get_metric_label_for_title_for_metric_info()
        axis_label = cls.metric_label_for_distribution if cls.metric_label_for_distribution else cls.metric_label

        if freq:
            period_map = {'ME': 'Month', 'W': 'Week', 'D': 'Day'}
            # Fall back to the raw frequency so unmapped values do not raise KeyError.
            period_name = period_map.get(freq, freq)

            if not agg_func:
                agg_func = cls.agg_func
            # Use a callable's name (not its repr) in the generated column name.
            agg_name = getattr(agg_func, '__name__', agg_func)
            column = f'{agg_name}_{cls.metric}_per_{str(period_name).lower()}'

            # Aggregate the metric by the requested frequency
            series = (
                cls.df.groupby(pd.Grouper(key=cls.time_column, freq=freq), observed=False)[cls.metric]
                .agg(agg_func)
                .to_frame(column)
            )[column]
            title_label += f' per {period_name}'
        else:
            column = cls.metric
            series = cls.df[cls.metric]

        # Caller-supplied labels/title always win over the generated defaults.
        if 'labels' not in kwargs:
            kwargs['labels'] = {column: axis_label}
        if 'title' not in kwargs:
            kwargs['title'] = f'Distribution of {title_label}'

        return series.explore.info(**kwargs)
    
    @classmethod
    def get_metric_label_for_title_for_metric_info(cls):
        """
        Return the metric label to use in titles.

        ``metric_label_for_distribution`` is preferred over ``metric_label``;
        only the text before the first comma is returned.

        Raises:
        -------
        ValueError
            If neither label has been configured.
        """
        label_source = cls.metric_label_for_distribution or cls.metric_label
        if not label_source:
            raise ValueError('For auto create title, metric_label_for_distribution or metric_label must be define')
        # split() returns the whole string when there is no comma
        return label_source.split(',')[0]
    
    @classmethod
    def metric_top(cls, id_column='order_id', n=10, freq=None, agg_func=None) -> pd.DataFrame:
        """
        Return the top n entries by metric value, with optional temporal aggregation.

        Parameters:
        -----------
        id_column : str, optional (default='order_id')
            Column used as the index of the result in the non-aggregated case.
        n : int, optional (default=10)
            Number of top entries to return.
        freq : str, optional
            The time frequency for aggregation (e.g., 'ME' for month, 'W' for week, 'D' for day).
            If not provided, raw rows are ranked.
        agg_func : str or function, optional
            The aggregation function to apply when freq is specified.
            If not provided, uses the class's default agg_func.

        Returns:
        --------
        pd.DataFrame
            One-column frame with the n largest metric values, sorted descending.
            Indexed by ``id_column`` without aggregation, or by time period with it.
        """
        if not freq:
            # Non-aggregated case: rank individual records
            return cls.df.set_index(id_column)[cls.metric].sort_values(ascending=False).head(n).to_frame()

        if not agg_func:
            agg_func = cls.agg_func

        # Aggregate the metric by the requested frequency, then rank the periods
        per_period = (
            cls.df.groupby(pd.Grouper(key=cls.time_column, freq=freq), observed=False)[cls.metric]
            .agg(agg_func)
        )
        return per_period.sort_values(ascending=False).head(n).to_frame()

    @classmethod
    def get_dim(cls, print_by_chunk=True) -> Union[None, list]:
        """
        List the configured dimension names.

        With ``print_by_chunk=True`` the names are printed as a Python list
        literal, five per line, ready to be copied into a variable, and
        nothing is returned. Otherwise the names are returned as a list.
        """
        names = list(cls.dimensions.keys())
        if not print_by_chunk:
            return names

        last_pos = len(names) - 1
        pieces = []
        for pos, name in enumerate(names):
            separator = ", " if pos < last_pos else ""
            line_break = "\n" if (pos + 1) % 5 == 0 else ""
            pieces.append(f"'{name}'{separator}{line_break}")
        print("[" + "".join(pieces) + "]")

    @classmethod
    def check_cur_dim(cls, kwargs) -> None:
        """
        Resolve integer shortcuts for dimension arguments in place.

        For each of ``x``, ``y``, ``color``, ``cat1`` and ``cat2`` given as an
        int, the value is replaced by ``cls.cur_dim[<int>]``. A ``**By <label>**``
        heading and the resolved column name are printed for the report.
        """
        for arg_name in ('x', 'y', 'color', 'cat1', 'cat2'):
            if arg_name not in kwargs or not isinstance(kwargs[arg_name], int):
                continue
            kwargs[arg_name] = cls.cur_dim[kwargs[arg_name]]
            heading = '**By ' + cls.dimensions[kwargs[arg_name]] + '**'
            print(heading)
            print(f'{arg_name}: ', kwargs[arg_name])

    @classmethod
    def configure(cls, **kwargs) -> None:
        """
        Reset the settings, then apply the given ones as class attributes.

        Raises AttributeError on the first name that is not an existing
        attribute of the class; settings applied before it stay in place.
        """
        cls.reset_configure()
        for option, setting in kwargs.items():
            if not hasattr(cls, option):
                raise AttributeError(f"Invalid config parameter: {option}")
            setattr(cls, option, setting)
            
    @classmethod
    def to_slide(cls, fig: go.Figure, title_postfix: str = None):
        """
        Save the figure as an image named after its title, for use in slides.

        Does nothing while ``cls.block_save_fig_for_slides`` is True.

        Parameters:
        -----------
        fig : go.Figure
            Figure to export; its layout title becomes the file name.
        title_postfix : str, optional
            Text appended to the title in the file name. Non-string values
            (e.g. ``True``) are ignored.

        Raises:
        -------
        ValueError
            If the figure has no title, since no file name can be derived.
        """
        if cls.block_save_fig_for_slides:
            return

        import os

        title = fig.layout.title.text
        if not title:
            raise ValueError('Figure must have a title to be saved for slides')
        if isinstance(title_postfix, str):
            title += title_postfix

        # Make sure the target directory exists before plotly writes into it.
        target_dir = os.path.dirname(cls.slide_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        fig.write_image(f"{cls.slide_path}{title}.{cls.slide_img_fmt}")
        
    @classmethod
    def line(cls, **kwargs) -> go.Figure:
        """
        Build a line plot.

        ``to_slide`` (optional) is removed from kwargs; when truthy the figure
        is handed to ``cls.to_slide`` with that value. A non-None ``data_frame``
        replaces ``cls.df`` as the data source. Labels are always prepared;
        the title is generated only when none is given.
        """
        slide_flag = kwargs.pop('to_slide', False)
        kwargs = cls._prepare_common_settings(kwargs, 'line')
        kwargs['labels'] = cls._prepare_labels(kwargs, 'line')
        if not kwargs.get('title'):
            kwargs['title'] = cls._prepare_title(kwargs, 'line')
        source = kwargs.pop('data_frame') if kwargs.get('data_frame') is not None else cls.df
        fig = cls._update_fig(source.viz.line(**kwargs), kwargs, 'line')
        if slide_flag:
            cls.to_slide(fig, slide_flag)
        return fig

    @classmethod
    def bar_groupby(cls, **kwargs) -> go.Figure:
        """
        Build a bar plot with groupby aggregation.

        ``to_slide`` (optional) is removed from kwargs; when truthy the figure
        is handed to ``cls.to_slide`` with that value. A non-None ``data_frame``
        replaces ``cls.df`` as the data source.
        """
        slide_flag = kwargs.pop('to_slide', False)
        kwargs = cls._prepare_common_settings(kwargs, 'bar_groupby')
        kwargs['labels'] = cls._prepare_labels(kwargs, 'bar_groupby')
        if not kwargs.get('title'):
            kwargs['title'] = cls._prepare_title(kwargs, 'bar_groupby')
        source = kwargs.pop('data_frame') if kwargs.get('data_frame') is not None else cls.df
        fig = cls._update_fig(source.viz.bar(**kwargs), kwargs, 'bar_groupby')
        if slide_flag:
            cls.to_slide(fig, slide_flag)
        return fig

    @classmethod
    def line_resample(cls, **kwargs) -> go.Figure:
        """
        Build a line plot with time resampling.

        ``to_slide`` (optional) is removed from kwargs; when truthy the figure
        is handed to ``cls.to_slide`` with that value. A non-None ``data_frame``
        replaces ``cls.df`` as the data source.
        """
        slide_flag = kwargs.pop('to_slide', False)
        kwargs = cls._prepare_common_settings(kwargs, 'line_resample')
        kwargs['labels'] = cls._prepare_labels(kwargs, 'line_resample')
        if not kwargs.get('title'):
            kwargs['title'] = cls._prepare_title(kwargs, 'line_resample')
        source = kwargs.pop('data_frame') if kwargs.get('data_frame') is not None else cls.df
        fig = cls._update_fig(source.viz.line(**kwargs), kwargs, 'line')
        if slide_flag:
            cls.to_slide(fig, slide_flag)
        return fig

    @classmethod
    def area_resample(cls, **kwargs) -> go.Figure:
        """
        Build an area plot with time resampling.

        Settings, labels and title are prepared exactly as for
        ``line_resample``; only the plotting call differs. ``to_slide`` and
        ``data_frame`` are handled as in the other plot methods.
        """
        slide_flag = kwargs.pop('to_slide', False)
        kwargs = cls._prepare_common_settings(kwargs, 'line_resample')
        kwargs['labels'] = cls._prepare_labels(kwargs, 'line_resample')
        if not kwargs.get('title'):
            kwargs['title'] = cls._prepare_title(kwargs, 'line_resample')
        source = kwargs.pop('data_frame') if kwargs.get('data_frame') is not None else cls.df
        fig = cls._update_fig(source.viz.area(**kwargs), kwargs, 'area')
        if slide_flag:
            cls.to_slide(fig, slide_flag)
        return fig

    @classmethod
    def heatmap(cls, **kwargs) -> go.Figure:
        """
        Build a heatmap.

        ``to_slide`` (optional) is removed from kwargs; when truthy the figure
        is handed to ``cls.to_slide`` with that value. A non-None ``data_frame``
        replaces ``cls.df`` as the data source.
        """
        slide_flag = kwargs.pop('to_slide', False)
        kwargs = cls._prepare_common_settings(kwargs, 'heatmap')
        kwargs['labels'] = cls._prepare_labels(kwargs, 'heatmap')
        if not kwargs.get('title'):
            kwargs['title'] = cls._prepare_title(kwargs, 'heatmap')
        source = kwargs.pop('data_frame') if kwargs.get('data_frame') is not None else cls.df
        fig = cls._update_fig(source.viz.heatmap(**kwargs), kwargs, 'heatmap')
        if slide_flag:
            cls.to_slide(fig, slide_flag)
        return fig

    @classmethod
    def pie_bar(cls, **kwargs) -> go.Figure:
        """
        Build a pie_bar plot.

        ``to_slide`` (optional) is removed from kwargs; when truthy the figure
        is handed to ``cls.to_slide`` with that value. A non-None ``data_frame``
        replaces ``cls.df`` as the data source.
        """
        slide_flag = kwargs.pop('to_slide', False)
        kwargs = cls._prepare_common_settings(kwargs, 'pie_bar')
        kwargs['labels'] = cls._prepare_labels(kwargs, 'pie_bar')
        if not kwargs.get('title'):
            kwargs['title'] = cls._prepare_title(kwargs, 'pie_bar')
        source = kwargs.pop('data_frame') if kwargs.get('data_frame') is not None else cls.df
        fig = cls._update_fig(source.viz.pie_bar(**kwargs), kwargs, 'pie_bar')
        if slide_flag:
            cls.to_slide(fig, slide_flag)
        return fig

    @classmethod
    def box(cls, **kwargs) -> go.Figure:
        """
        Build a boxplot.

        ``to_slide`` (optional) is removed from kwargs; when truthy the figure
        is handed to ``cls.to_slide`` with that value. A non-None ``data_frame``
        replaces ``cls.df`` as the data source.
        """
        slide_flag = kwargs.pop('to_slide', False)
        kwargs = cls._prepare_common_settings(kwargs, 'box')
        kwargs['labels'] = cls._prepare_labels(kwargs, 'box')
        if not kwargs.get('title'):
            kwargs['title'] = cls._prepare_title(kwargs, 'box')
        source = kwargs.pop('data_frame') if kwargs.get('data_frame') is not None else cls.df
        fig = cls._update_fig(source.viz.box(**kwargs), kwargs, 'box')
        if slide_flag:
            cls.to_slide(fig, slide_flag)
        return fig

    @classmethod
    def histogram(cls, **kwargs) -> go.Figure:
        """
        Build a histogram.

        ``to_slide`` (optional) is removed from kwargs; when truthy the figure
        is handed to ``cls.to_slide`` with that value. A non-None ``data_frame``
        replaces ``cls.df`` as the data source. When ``cls.df`` is used and the
        color column is ``order_avg_reviews_score``, a copy with that column
        cast to a string category is plotted instead.
        """
        slide_flag = kwargs.pop('to_slide', False)
        kwargs = cls._prepare_common_settings(kwargs, 'histogram')
        kwargs['labels'] = cls._prepare_labels(kwargs, 'histogram')
        if not kwargs.get('title'):
            kwargs['title'] = cls._prepare_title(kwargs, 'histogram')

        if kwargs.get('data_frame') is not None:
            source = kwargs.pop('data_frame')
        elif kwargs.get('color') == 'order_avg_reviews_score':
            # The score is numeric; plotly colors it better as a string category.
            source = cls.df.copy()
            source['order_avg_reviews_score'] = cls.df['order_avg_reviews_score'].astype(str).astype('category')
        else:
            source = cls.df

        fig = cls._update_fig(source.viz.histogram(**kwargs), kwargs, 'histogram')
        if slide_flag:
            cls.to_slide(fig, slide_flag)
        return fig

    @classmethod
    def cat_compare(cls, **kwargs) -> go.Figure:
        """
        Build the plots that compare two categorical columns.

        When ``category_orders`` is not given, both categories default to
        'descending', overlaid with any ``category_orders`` found in
        ``cls.plotly_kwargs``. No labels or title are generated here.
        ``to_slide`` and ``data_frame`` are handled as in the other plot methods.
        """
        slide_flag = kwargs.pop('to_slide', False)
        kwargs = cls._prepare_common_settings(kwargs, 'cat_compare')
        if 'category_orders' not in kwargs:
            orders = {kwargs.get('cat1'): 'descending', kwargs.get('cat2'): 'descending'}
            if cls.plotly_kwargs and 'category_orders' in cls.plotly_kwargs:
                orders.update(cls.plotly_kwargs['category_orders'])
            kwargs['category_orders'] = orders
        source = kwargs.pop('data_frame') if kwargs.get('data_frame') is not None else cls.df
        fig = source.viz.cat_compare(**kwargs)
        if slide_flag:
            cls.to_slide(fig, slide_flag)
        return fig
    
    @classmethod
    def period_change(cls, **kwargs) -> go.Figure:
        """
        Plot period-over-period changes of the metric.

        ``to_slide`` (optional) is removed from kwargs; when truthy the figure
        is handed to ``cls.to_slide`` with that value. A non-None ``data_frame``
        replaces ``cls.df`` as the data source. Labels are always prepared;
        the title is generated only when none is given.
        """
        to_slide = kwargs.pop('to_slide', False)
        # Common settings are prepared exactly once: running them again after the
        # labels are built would let ``cls.plotly_kwargs['labels']`` overwrite the
        # merged labels.
        kwargs = cls._prepare_common_settings(kwargs, 'period_change')
        kwargs['labels'] = cls._prepare_labels(kwargs, 'period_change')
        if not kwargs.get('title'):
            kwargs['title'] = cls._prepare_title(kwargs, 'period_change')
        if kwargs.get('data_frame') is not None:
            fig = kwargs.pop('data_frame').viz.period_change(**kwargs)
        else:
            fig = cls.df.viz.period_change(**kwargs)
        fig = cls._update_fig(fig, kwargs, 'period_change')
        if to_slide:
            cls.to_slide(fig, to_slide)
        return fig

    @classmethod
    def _prepare_labels(cls, kwargs: dict, graph_type: str) -> dict:
        """
        Assemble the ``labels`` mapping for a plot of the given type.

        Metric and time-column labels come from the class settings, heatmap
        axis labels from ``cls.dimensions``. Labels passed by the caller in
        ``kwargs['labels']`` are applied last and therefore take precedence.

        Raises ValueError when a heatmap axis is not a known dimension.
        """
        result = {}

        # Metric label: the distribution-specific one for box/histogram plots
        if graph_type in ('line', 'bar_groupby', 'line_resample', 'pie_bar'):
            metric_text = cls.metric_label
        elif graph_type in ('box', 'histogram'):
            metric_text = cls.metric_label_for_distribution
        else:
            metric_text = None
        if cls.metric and metric_text:
            result[cls.metric] = metric_text

        # Time column label
        uses_time_label = graph_type in ('line', 'line_resample', 'box', 'period_change')
        if uses_time_label and cls.time_column and cls.time_column_label:
            result[cls.time_column] = cls.time_column_label

        # Heatmap axes
        if graph_type == 'heatmap':
            x_key = kwargs['x']
            if isinstance(x_key, pd.core.resample.TimeGrouper):
                result['x'] = cls.time_column_label
            elif x_key in cls.dimensions:
                result['x'] = cls.dimensions[x_key]
            else:
                raise ValueError(f"{x_key} not in cls.dimensions")
            y_key = kwargs['y']
            if y_key not in cls.dimensions:
                raise ValueError(f"{y_key} not in cls.dimensions")
            result['y'] = cls.dimensions[y_key]
            result['color'] = cls.metric_label

        if 'labels' in kwargs:
            result.update(kwargs['labels'])
        return result

    @classmethod
    def _prepare_title(cls, kwargs: dict, graph_type: str) -> str:
        """
        Build the automatic title for a plot of the given type.

        Besides returning the title, this also fills in a default
        ``category_orders`` entry in ``kwargs`` for the axis dimension of
        bar_groupby / pie_bar / non-time-series box plots when
        ``cls.axis_sort_order`` is configured.

        Parameters:
        -----------
        kwargs : dict
            Plot arguments (already processed by ``_prepare_common_settings``).
        graph_type : str
            One of the graph types used by the public plot methods.

        Returns:
        --------
        str
            The generated title.

        Raises:
        -------
        ValueError
            If the axis dimension cannot be determined or is unknown, if the
            color column is unknown, or if a required label or frequency is
            not configured.
        """
        def first_part(label: str) -> str:
            # Titles use only the text before the first comma of a label.
            return label.split(',')[0]

        is_box_time_series = graph_type == 'box' and kwargs.get('mode') == 'time_series'
        has_time_axis = graph_type in ('line', 'line_resample') or is_box_time_series
        color_comes_first = graph_type in ('line', 'line_resample', 'histogram') or is_box_time_series

        # Determine axis_dimension (the non-metric axis)
        if graph_type in ('bar_groupby', 'pie_bar') or (graph_type == 'box' and not is_box_time_series):
            if kwargs.get('y') == cls.metric:
                axis_dimension = kwargs['x']
            elif kwargs.get('x') == cls.metric:
                axis_dimension = kwargs['y']
            else:
                raise ValueError('Can not define axis_dimension')

            is_box = graph_type == 'box'
            # Box plots are sorted by an aggregate, so they also need agg_func.
            needs_default_order = (
                cls.axis_sort_order
                and (cls.agg_func or not is_box)
                and axis_dimension != 'purchase_season'
                and axis_dimension not in kwargs.get('category_orders', {})
            )
            if needs_default_order:
                axis_sort_order = cls.axis_sort_order
                # For bottom direction change sorting direction
                if kwargs.get('trim_top_n_direction') == 'bottom':
                    axis_sort_order = 'ascending'
                if is_box:
                    axis_sort_order = f'{cls.agg_func} {axis_sort_order}'
                kwargs.setdefault('category_orders', {}).update({axis_dimension: axis_sort_order})

            if axis_dimension not in cls.dimensions:
                raise ValueError(f'{axis_dimension} not in cls.dimensions')

        # Base part of title
        if graph_type in ('box', 'histogram'):
            distribution_label = cls.metric_label_for_distribution or cls.metric_label
            if not distribution_label:
                raise ValueError('For auto create title, metric_label_for_distribution or metric_label must be define')
            if graph_type == 'box':
                title = f'Boxplots of {first_part(distribution_label)}'
            else:
                title = f'Distribution of {first_part(distribution_label)}'
        elif graph_type != 'period_change':
            if cls.title_base:
                title = cls.title_base
            elif cls.metric_label:
                title = first_part(cls.metric_label)
            else:
                raise ValueError('For auto create title, title_base or metric_label must be define')

        # Axis dimension and color part of title
        if graph_type in ('line', 'bar_groupby', 'line_resample', 'pie_bar', 'box', 'histogram'):
            if not color_comes_first:
                title += f' by {cls.dimensions[axis_dimension]}'
            if 'color' in kwargs:
                if kwargs['color'] not in cls.dimensions:
                    raise ValueError(f"{kwargs['color']} not in cls.dimensions")
                joiner = 'by' if color_comes_first else 'and'
                title += f" {joiner} {cls.dimensions[kwargs['color']]}"

        # Datetime part of title
        if has_time_axis:
            resample_freq_for_title = kwargs['freq'] if 'freq' in kwargs else cls.freq
            if not resample_freq_for_title:
                raise ValueError('freq must be define')
            freq_map = {'h': 'Hour', 'D': 'Day', 'W': 'Week', 'ME': 'Month', 'M': 'Month'}
            # Unmapped frequencies are shown verbatim instead of raising KeyError.
            freq_name = freq_map.get(resample_freq_for_title, resample_freq_for_title)
            joiner = 'and' if 'color' in kwargs else 'by'
            title += f' {joiner} {freq_name}'

        # Part for specific graph_type
        if graph_type == 'heatmap':
            # A time grouper on x has no entry in cls.dimensions; use the time
            # column label, mirroring what _prepare_labels does for this case.
            if isinstance(kwargs['x'], pd.core.resample.TimeGrouper):
                if not cls.time_column_label:
                    raise ValueError('time_column_label must be define')
                x_label = cls.time_column_label
            else:
                x_label = cls.dimensions[kwargs['x']]
            title += f" by {x_label}"
            title += f" and {cls.dimensions[kwargs['y']]}"
        elif graph_type in ('histogram', 'box'):
            if 'lower_quantile' in kwargs or 'upper_quantile' in kwargs:
                quantile_for_title = ' ('
                if kwargs.get('mode') == 'dual_box_trim':
                    quantile_for_title += 'Right: '
                if 'lower_quantile' in kwargs:
                    quantile_for_title += f"from {kwargs['lower_quantile']} "
                if 'upper_quantile' in kwargs:
                    quantile_for_title += f"to {kwargs['upper_quantile']} "
                quantile_for_title += 'Quantile)'
                title += quantile_for_title

        if graph_type == 'period_change':
            title_map = {
                'mom': 'Monthly Change in {metric}',
                'wow': 'Weekly Change in {metric}',
                'dod': 'Daily Change in {metric}',
                'yoy': 'Yearly Change in {metric}'
            }
            period = kwargs.get('period', 'mom')
            if not cls.metric_label:
                raise ValueError('metric_label must be define')
            title = title_map[period].format(metric=first_part(cls.metric_label))
        return title

    @classmethod
    def _prepare_common_settings(cls, kwargs: dict, graph_type: str) -> dict:
        """
        Fill ``kwargs`` with the defaults shared by the plot methods.

        Integer dimension shortcuts are resolved first, then hover formatting,
        top-n trimming, aggregation, normalization, frequency and the
        per-graph-type axis defaults are applied. ``kwargs`` is modified in
        place and also returned.

        Raises ValueError when a plot that needs ``x`` or ``y`` gets neither.
        """
        def fill_missing_axis_with_metric() -> None:
            # The caller names one axis; the other one becomes the metric.
            if 'x' not in kwargs and 'y' not in kwargs:
                raise ValueError('x or y must be define')
            if 'y' not in kwargs:
                kwargs.setdefault('y', cls.metric)
            elif 'x' not in kwargs:
                kwargs.setdefault('x', cls.metric)

        cls.check_cur_dim(kwargs)

        if graph_type not in ('heatmap', 'cat_compare'):
            kwargs.setdefault('hover_data', {}).update({cls.metric: ':.3f'})
        if graph_type == 'cat_compare':
            if 'cat1' in kwargs and 'cat2' not in kwargs:
                kwargs['cat2'] = cls.metric
            elif 'cat2' in kwargs and 'cat1' not in kwargs:
                kwargs['cat1'] = cls.metric

        # top_n settings for the listed color / y columns
        trimmed_color_columns = (
            'customer_state', 'customer_city', 'order_product_categories',
            'order_general_product_categories', 'seller_state', 'seller_city',
        )
        trimmed_y_columns = (
            'customer_state', 'customer_city', 'order_product_categories', 'order_general_product_categories',
            'product_category', 'general_product_category', 'seller_state', 'seller_city',
        )
        if graph_type in ('line', 'line_resample'):
            if kwargs.get('color') in trimmed_color_columns and 'trim_top_n_color' not in kwargs:
                kwargs['trim_top_n_color'] = 5
        if graph_type in ('bar_groupby', 'box') and kwargs.get('y') in trimmed_y_columns:
            if 'trim_top_n_y' not in kwargs:
                kwargs['trim_top_n_y'] = 15
            if 'height' not in kwargs:
                kwargs['height'] = 500

        # aggregation
        if graph_type in ('bar_groupby', 'line_resample', 'pie_bar', 'heatmap'):
            kwargs.setdefault('agg_func', cls.agg_func)
            if graph_type != 'heatmap':
                kwargs.setdefault('agg_column', cls.metric)
        # normalization
        if graph_type in ('bar_groupby', 'pie_bar'):
            if cls.text_auto:
                kwargs.setdefault('text_auto', cls.text_auto)
            kwargs.setdefault('norm_by', cls.norm_by)
        # time freq
        if graph_type == 'line_resample':
            kwargs.setdefault('freq', cls.freq)

        # settings for specific graph_type
        if graph_type in ('line', 'line_resample'):
            kwargs.setdefault('x', cls.time_column)
            kwargs.setdefault('y', cls.metric)
        elif graph_type == 'bar_groupby':
            fill_missing_axis_with_metric()
        elif graph_type == 'heatmap':
            kwargs.setdefault('do_pivot', True)
            kwargs.setdefault('z', cls.metric)
            kwargs['width'] = 1100
        elif graph_type == 'pie_bar':
            kwargs.setdefault('hole', 0.5)
            fill_missing_axis_with_metric()
        elif graph_type == 'box':
            if kwargs.get('mode') == 'time_series':
                kwargs.setdefault('x', cls.time_column)
                kwargs.setdefault('y', cls.metric)
            else:
                kwargs['show_dual'] = True
                kwargs['upper_quantile'] = 0.95
                fill_missing_axis_with_metric()
        elif graph_type == 'histogram':
            has_named_x = 'x' in kwargs and isinstance(kwargs['x'], str)
            if has_named_x or 'color' in kwargs:
                kwargs.setdefault('show_hist', False)
                kwargs.setdefault('show_kde', True)
                kwargs.setdefault('mode', 'dual_box_trim')
                kwargs.setdefault('show_legend_title', True)
            else:
                kwargs.setdefault('mode', 'dual_hist_trim')
            kwargs.setdefault('x', cls.metric)
            kwargs.setdefault('upper_quantile', 0.95)
        elif graph_type == 'period_change':
            kwargs.setdefault('metric_col', cls.metric)
            kwargs.setdefault('date_col', cls.time_column)
            kwargs.setdefault('agg_func', cls.agg_func)
            if cls.plotly_kwargs:
                kwargs.update(cls.plotly_kwargs)
        return kwargs
    
    @classmethod
    def _update_fig(cls, fig: go.Figure, kwargs: dict, graph_type: str) -> go.Figure:
        """
        Apply the configured ``cls.update_fig`` layout parameters to the figure.

        Only parameters that exist as attributes of ``fig.layout`` are applied.
        ``kwargs`` and ``graph_type`` are accepted for a uniform call signature
        but are not used. Returns the same figure object.
        """
        layout_updates = {
            param: cls.update_fig[param]
            for param in cls.update_fig
            if hasattr(fig.layout, param)
        }
        fig.update_layout(**layout_updates)
        return fig

# Short alias for the class (not an instance); the cells below call it as ``pb.<method>``.
pb = PlotBuilder

<h2 id="4-2"> 4.2 Time Series Analysis</h2>

### 4.2.1 Order Volume vs. Successful Purchases

**Order Status Distribution**

First, we examine order counts by status. Note that for purchase analysis, we'll focus solely on delivered orders since other statuses don't represent completed purchases.

In [None]:
# Column name -> display label. The first two keys double as the x and y
# columns of the plot below (taken by position from the index).
labels = pd.Series(dict(
    order_purchase_dt = 'Date'
    , order_id = 'Number of Orders'
    , order_status = 'Order Status'
))
# One line per order status; presumably the number of unique order ids per
# month-end period ('nunique' + 'ME') — see the title.
df_orders.viz.line(
    x=labels.index[0]
    , y=labels.index[1]
    , color='order_status'
    , agg_func='nunique'
    , freq='ME'
    , labels=labels
    , width=900
    , title='Number of Orders by Order Status and Month'
)

**Key Observations:**  

- Delivered orders consistently dominated all other statuses throughout the period.

Let's look at the same chart without delivered orders.

In [None]:
# Column name -> display label. The first two keys double as the x and y
# columns of the plot below (taken by position from the index).
labels = pd.Series(dict(
    order_purchase_dt = 'Date'
    , order_id = 'Number of Orders'
    , order_status = 'Order Status'
))
# Same chart as above, restricted to orders whose status is not 'Delivered'.
df_orders[lambda x: x.order_status != 'Delivered'].viz.line(
    x=labels.index[0]
    , y=labels.index[1]
    , color='order_status'
    , agg_func='nunique'
    , freq='ME'
    , labels=labels
    , width=900
    , markers=True
    , title='Number of Orders by Order Status and Month (Without Delivered)'
)

**Key Observations:**  

- March-April 2018 saw a sharp spike in orders stuck in "shipped" status
- February and August 2018 saw peaks in "canceled" status
- November 2017 "unavailable" peak coincided with Black Friday promotions
- As identified during anomaly detection, most of these orders ultimately weren't delivered for various reasons

Let's compare the number of orders and sales.

In [None]:
# Unique order ids per month across all orders.
tmp_df_orders_resampled = (
    df_orders
    .resample('ME', on='order_purchase_dt')['order_id']
    .nunique()
    .reset_index(name='Orders')
)
# Unique order ids per month across sales, joined to the order counts
# (inner join: only months present in both tables), plus the Sales/Orders ratio.
tmp_tmp_df_resampled = (
    df_sales
    .resample('ME', on='order_purchase_dt')['order_id']
    .nunique()
    .reset_index(name='Sales')
    .merge(tmp_df_orders_resampled, how='inner', on='order_purchase_dt')
    .assign(sale_share=lambda d: d['Sales'].div(d['Orders']))
)
# Long format (one row per month and series) for plotting both counts on one chart.
tmp_tmp_df_resampled_melted = tmp_tmp_df_resampled.melt(
    id_vars='order_purchase_dt',
    value_vars=['Orders', 'Sales'],
    var_name='order_or_sale',
    value_name='count',
)

**By month**

In [None]:
# Column name -> caption shown on the chart; the first two keys are the x and y columns.
labels = pd.Series({
    'order_purchase_dt': 'Date',
    'count': 'Number of Orders',
    'order_or_sale': 'Orders or Sales',
})
# Monthly order and sale counts as two lines, legend order fixed to Orders, Sales.
tmp_tmp_df_resampled_melted.viz.line(
    x='order_purchase_dt',
    y='count',
    color='order_or_sale',
    labels=labels,
    category_orders={'order_or_sale': ['Orders', 'Sales']},
    title='Number of Orders and Sales by Month',
)

**Key Observations:**  

- Stable purchase-to-order ratio maintained month-over-month

Let's look at the conversion of orders to successful sales.

In [None]:
# Column name -> caption shown on the chart; the keys are the x and y columns.
labels = pd.Series({
    'order_purchase_dt': 'Date',
    'sale_share': 'Conversion to Sale',
})
# Monthly Sales/Orders ratio; the figure is passed to pb.to_slide and then displayed.
fig = tmp_tmp_df_resampled.viz.line(
    x='order_purchase_dt',
    y='sale_share',
    labels=labels,
    title='Conversion to Success Sale by Month',
)
pb.to_slide(fig)
fig.show()

**Key Observations:**  

- Consistent month-over-month improvement in successful purchase conversion

### 4.2.2 Share of Canceled Orders

In [None]:
pb.configure(
    time_column = 'order_purchase_dt'
    , time_column_label = 'Date'
    , metric = 'target_share'
    , metric_label = 'Share of Canceled Orders'
    , freq='ME'
)

**By month**

In [None]:
tmp_tmp_df_res = df_orders['order_status'].preproc.calc_target_category_share(
    target_category='Canceled'
    , group_columns=['order_purchase_dt']
    , resample_freq = 'ME'
)

In [None]:
pb.line(
    data_frame=tmp_tmp_df_res
    , to_slide=True
)

**Key Observations:**  

- Fluctuated between 0.2% and 1.2% across months

**By Review Score**

In [None]:
tmp_tmp_df_res = df_orders['order_status'].preproc.calc_target_category_share(
     target_category='Canceled'
    , group_columns=['order_purchase_dt', 'order_avg_reviews_score']
    , resample_freq = 'ME'
)

In [None]:
pb.line(
    data_frame=tmp_tmp_df_res
    , color='order_avg_reviews_score'
    , to_slide=True
)

**Key Observations:**  

- Orders with score 1 showed significantly higher cancelation rates
- Scores 4-5 maintained 0% cancelation rates for most months

In [None]:
del tmp_tmp_df_res

### 4.2.3 Number of Sales

In [None]:
pb.configure(
    df = df_sales
    , time_column = 'order_purchase_dt'
    , time_column_label = 'Date'
    , metric = 'order_id'
    , metric_label = 'Number of Sales'
    , agg_func = 'nunique'
    , freq = 'ME'
)

**By Day and Month**

In [None]:
# Draw the configured metric at daily and monthly granularity;
# to_slide is True only for the monthly ('ME') chart.
for freq in ('D', 'ME'):
    pb.line_resample(freq=freq, to_slide=(freq == 'ME')).show()

**Key Observations:**  

- There was an anomalous peak in sales volume on November 24, 2017 - Black Friday  
- The number of sales steadily increased until November 2017, reaching a peak of approximately 7.3k orders per month. After that, sales stabilized and fluctuated in the range of 6-7k orders per month.


**Monthly Growth**

In [None]:
pb.period_change(
    period='mom'
    , to_slide=True
)

**Key Observations:**  

- The most significant decline in sales (more than 5% month-over-month) was observed in April, June, and December 2017, as well as in February and June 2018.
- Conversely, the most pronounced spikes in growth (exceeding 50%) were recorded in February, March, and May 2017, and in November 2017 (Black Friday). 
- This indicates strong demand volatility in specific months, possibly linked to seasonality or marketing activities.

**By Time of Day**

In [None]:
pb.line_resample(
    color='purchase_time_of_day'
    , to_slide=True
)

**Key Observations:**  

- Nighttime sales are on average lower than other times of day  
- The highest sales volume occurs in the evening  

**By Day of Week**

Analyzing trends by month would be incorrect, as one month might have 4 Mondays while another has 5.

Therefore, we'll examine weekly patterns.

In [None]:
pb.area_resample(color='purchase_weekday', freq='W', title='Number of Sales by Day of Week and Week')

In [None]:
pb.heatmap(x=pd.Grouper(key=pb.time_column, freq='W')
        , y='purchase_weekday'
        , text_auto=False
        , title='Number of Sales by Day of Week and Week'
)

**Key Observations:**  

- There's no significant difference in sales by day of week, though weekends show slightly lower volumes  
- The peak value occurred on Black Friday  

**By Weekday vs Weekend**

In [None]:
pb.line_resample(color='purchase_day_type')

**Key Observations:**  

- Weekday sales grew faster than weekend sales  

**By Review Score**

In [None]:
pb.line_resample(
    color='order_avg_reviews_score'
    , to_slide=True
)

**Key Observations:**  

- Orders with 5-star reviews significantly outnumber others each month  
- Orders with 2-star reviews are consistently the least common  
- 5-star orders continued growing in 2018 despite overall sales plateauing, due to declining 1-star orders  

**By Delivery Delay Status**

In [None]:
pb.line_resample(
    color='is_delayed'
    , to_slide=True
)

**Key Observations:**  

- Delayed orders aren't increasing proportionally with total orders - a positive trend  
- Peak months for delays: November 2017 (Black Friday) and March 2018  

**By Payment Category**

In [None]:
pb.line_resample(color='order_total_payment_cat')

**Key Observations:**  

- The overall sales trend remains consistent across cheap, medium, and expensive orders  

**By Order Weight Category**

In [None]:
pb.line_resample(color='order_total_weight_cat')

**Key Observations:**  

- The overall sales trend remains consistent across light, medium, and heavy orders  

**By Delivery Time Category**

In [None]:
pb.line_resample(
    color='delivery_time_days_cat'
    , to_slide=True
)

**Key Observations:**  

- November 2017 saw a sharp spike in long-delivery orders (likely Black Friday effect)  
- High volumes of long deliveries persisted until March 2018  
- By August 2018, long deliveries became less common than medium and fast ones  

**By Presence of Installment Payments**

In [None]:
pb.line_resample(color='order_has_installment')

**Key Observations:**  

- Before 2018, non-installment orders lagged behind installment ones, but volumes equalized in 2018  

**By Top Customer States**

In [None]:
pb.line_resample(
    color='customer_state'
    , to_slide=True
)

**Key Observations:**  

- São Paulo state consistently led in sales volume throughout the period  
- Unlike other states, São Paulo maintained stable monthly sales in 2018  
- Rio de Janeiro and Minas Gerais ranked second and third respectively  

**By Top Customer Cities**

In [None]:
pb.line_resample(
    color='customer_city'
    , to_slide=True
)

**Key Observations:**  

- São Paulo city consistently had the highest sales volume  
- Rio de Janeiro ranked second  
- Unlike other cities, São Paulo showed monthly sales growth in 2018  

### 4.2.4 Sales Amount

In [None]:
pb.configure(
    df = df_sales
    , time_column = 'order_purchase_dt'
    , time_column_label = 'Date'
    , metric = 'total_payment'
    , metric_label = 'Sales Amount, R$'
    , metric_label_for_distribution = 'Order Value, R$'
    , title_base = 'Sales Amount'
    , agg_func = 'sum'
    , freq='ME'
)

**By Day and Month**

In [None]:
# Time-series box plots of the configured metric: default call first,
# then the same call with upper_quantile=0.95.
# NOTE(review): freq='M' differs from the 'ME' alias used everywhere else in this
# notebook ('M' is deprecated as an offset alias in pandas >= 2.2) — confirm whether
# pb.box really needs 'M' (e.g. a period alias) or should use 'ME' as well.
pb.box(mode='time_series', freq='M').show()
pb.box(mode='time_series', freq='M', upper_quantile=0.95).show()
# Daily and monthly line charts; to_slide is True only for the monthly ('ME') chart.
for freq in ('D', 'ME'):
    pb.line_resample(freq=freq, to_slide=(freq == 'ME')).show()

**Key Observations:**  

- There was an anomalous peak in sales revenue on November 24, 2017 - Black Friday  
- Monthly sales revenue grew until 2018, then stabilized at 1-1.2 million R$ per month  

**Monthly Growth**

In [None]:
pb.period_change(
    period='mom'
    , to_slide=True
)

**Key Observations:**  

- Sales revenue dropped by more than 5% month-over-month in April, June, and December 2017, and February and June 2018  

**By Time of Day**

In [None]:
pb.line_resample(color='purchase_time_of_day', to_slide=True)

**Key Observations:**  

- Nighttime sales revenue is on average lower than other times  
- Highest revenue occurs in evenings and afternoons  

**By Day of Week**

In [None]:
pb.area_resample(freq='W', color='purchase_weekday', title='Sales Amount by Day of the Week and Week')

In [None]:
pb.heatmap(x=pd.Grouper(key=pb.time_column, freq='W')
        , y='purchase_weekday'
        , text_auto=False
        , title='Sales Amount by Day of the Week and Week'
)

**Key Observations:**  

- No significant difference in revenue by day of week, though weekends are slightly lower  

**By Weekday vs Weekend**

In [None]:
pb.line_resample(color='purchase_day_type')

**Key Observations:**  

- Weekday revenue grew faster than weekend revenue  

**By Review Score**

In [None]:
pb.line_resample(color='order_avg_reviews_score', to_slide=True)

**Key Observations:**  

- Orders with 5-star reviews generate significantly more revenue each month  
- Orders with 2-star reviews consistently generate the least revenue  
- Black Friday saw the strongest revenue spikes for both 5-star and 1-star orders  
- 5-star order revenue continued growing in 2018 despite overall stagnation, partly due to declining 1-star order revenue  

**By Whether the Order is Delayed or Not**

In [None]:
pb.line_resample(color='is_delayed', to_slide=True)

**Key Observations:**  

- Peak months for delayed order revenue: November 2017 (Black Friday) and March 2018  

**By Payment Category**

In [None]:
pb.line_resample(color='order_total_payment_cat')

**Key Observations:**  

- The overall revenue trend remains consistent across cheap, medium, and expensive orders  

**By Order Weight Category**

In [None]:
pb.line_resample(color='order_total_weight_cat')

**Key Observations:**  

- The overall revenue trend remains consistent across light, medium, and heavy orders  

**By Delivery Time Category**

In [None]:
pb.line_resample(color='delivery_time_days_cat', to_slide=True)

**Key Observations:**  

- November 2017 saw a sharp spike in revenue from long-delivery orders (likely Black Friday effect)  
- High volumes of long deliveries persisted until April 2018 (unexplained by Black Friday)  
- The subsequent sharp decline in long deliveries wasn't matched by growth in other categories  
- This may explain why overall revenue stopped growing in 2018  

**By Presence of Installment Payments**

In [None]:
pb.line_resample(color='order_has_installment', to_slide=True)

**Key Observations:**  

- Installment orders consistently generated higher revenue than non-installment orders  

**By Top Customer States**

In [None]:
pb.line_resample(color='customer_state', to_slide=True)

**Key Observations:**  

- São Paulo state consistently led in sales revenue  
- Rio de Janeiro and Minas Gerais ranked second and third respectively  

**By Top Customer Cities**

In [None]:
pb.line_resample(color='customer_city', to_slide=True)

**Key Observations:**  

- São Paulo city consistently generated the highest sales revenue  
- Rio de Janeiro ranked second  

**By Payment Type**

In [None]:
pb.line_resample(color='order_payment_types', to_slide=True)

**Key Observations:**  

- Credit card payments consistently generated the highest revenue, with boleto second  
- Debit card payment revenue grew from June 2018  

**By Product Category**

In [None]:
pb.line_resample(color='order_general_product_categories', to_slide=True)

**Key Observations:**  

- Electronics consistently generated the highest revenue, followed by furniture  
- 'Beauty/Health' and 'Home/Garden' categories continued growing in 2018 while others slowed or declined  

### 4.2.5 Average Order Value

In [None]:
pb.configure(
    df = df_sales
    , time_column = 'order_purchase_dt'
    , time_column_label = 'Date'
    , metric = 'total_payment'
    , metric_label = 'Average Order Value, R$'
    , metric_label_for_distribution = 'Order Value, R$'
    , title_base = 'Average Order Value'
    , agg_func = 'mean'
    , freq='ME'
)

**By Day and Month**

In [None]:
# Draw the configured metric at daily and monthly granularity;
# to_slide is True only for the monthly ('ME') chart.
for freq in ('D', 'ME'):
    pb.line_resample(freq=freq, to_slide=(freq == 'ME')).show()

**Key Observations:**  

- The average order value fluctuates between 100-250 R$ daily  
- Black Friday didn't show a spike in average order value  
- Monthly average order value remains stable at 150-170 R$ without growth  

**Monthly Growth**

In [None]:
pb.period_change(period='mom', to_slide=True)

**Key Observations:**  

- Average order value dropped >5% month-over-month in May, July, November 2017 and August 2018  


**By Time of Day**

In [None]:
pb.line_resample(color='purchase_time_of_day')

**Key Observations:**  

- No significant difference in average order value by time of day  
- Nighttime averages are slightly lower  

**By Whether the Order is Delayed or Not**

In [None]:
pb.line_resample(color='is_delayed')

**Key Observations:**  

- June 2017 saw a major peak in average value for delayed orders  
- Smaller peaks occurred in April/December 2017 and January/June 2018  

**By Order Weight Category**

In [None]:
pb.line_resample(color='order_total_weight_cat')

**Key Observations:**  

- Heavy orders show more variability in average value over time  
- Consistently: heavy > medium > light order values  
- April-July 2017 saw significant decline in heavy order values, followed by fluctuating growth  


**By Delivery Time Category**

In [None]:
pb.line_resample(color='delivery_time_days_cat', to_slide=True)

**Key Observations:**  

- For most months, expensive items had longer delivery times  

**By Presence of Installment Payments**

In [None]:
pb.line_resample(color='order_has_installment', to_slide=True)

**Key Observations:**  

- Installment orders consistently show higher average values (logical as customers can afford more)  
- Installment order values fluctuate more over time  
- Pre-July 2017: steady decline in installment order values  
- Post-July 2017: fluctuating growth 

**By Review Score**

In [None]:
pb.line_resample(color='order_avg_reviews_score', to_slide=True)

**Key Observations:**  

- Orders with 1-star reviews typically had higher average values  
- Orders with 2-star reviews often had higher values than 3/4/5-star orders  
- Conclusion: lower ratings were more common for higher-value orders  

**By Top Customer Cities**

In [None]:
pb.line_resample(color='customer_city')

**Key Observations:**  

- February 2017 saw a major spike in Brasília's average order value  
- Otherwise, top 5 cities show similar average values  

### 4.2.6 Number of Purchases per Customer per Week

In [None]:
# Unique orders per customer per calendar month, converted to a weekly rate by
# dividing by the number of weeks in that month (days_in_month / 7).
tmp_df_res = (
    df_sales
    .groupby([pd.Grouper(key='order_purchase_dt', freq='ME'), 'customer_unique_id'])['order_id']
    .nunique()
    .reset_index(name='order_cnt')
    .assign(
        avg_orders_per_week=lambda d: d['order_cnt'] / (d['order_purchase_dt'].dt.days_in_month / 7)
    )
    .drop(columns=['order_cnt'])
)
# Show the ten customer-months with the highest weekly rate.
tmp_df_res.sort_values('avg_orders_per_week', ascending=False).head(10)

**Key Observations:**  

- User '12f5d6e1cbf93dafd9dcc19095df0b3d' had the highest weekly purchase frequency (January 2017)  

In [None]:
# Column name -> caption shown on the chart; the keys are the x and y columns.
labels = pd.Series({
    'order_purchase_dt': 'Date',
    'avg_orders_per_week': 'Average number of sales per week',
})
# Mean of the per-customer weekly rate for each month.
tmp_df_res.viz.line(
    x='order_purchase_dt',
    y='avg_orders_per_week',
    agg_func='mean',
    freq='ME',
    labels=labels,
    title='Average number of sales per week by month',
)

**Key Observations:**  

- Average weekly purchases per customer remains stable at 0.23-0.25  

### 4.2.7 ARPPU

In [None]:
pb.configure(
    time_column = 'order_purchase_dt'
    , time_column_label = 'Date'
    , metric = 'arppu'
    , metric_label = 'ARPPU, R$'
    , title_base = 'ARPPU'
    , freq = 'ME'
)

**By Day and Month**

In [None]:
# Per day: unique customers and summed product price; ARPPU is their ratio.
tmp_df_res = (
    df_sales
    .resample('D', on='order_purchase_dt')
    .agg({'customer_unique_id': 'nunique', 'total_products_price': 'sum'})
    .reset_index()
    .assign(arppu=lambda d: d['total_products_price'].div(d['customer_unique_id']))
)

In [None]:
fig_days = pb.line(data_frame=tmp_df_res, title='ARPPU by day')

In [None]:
# Per month: unique customers and summed product price; ARPPU is their ratio.
tmp_df_res = (
    df_sales
    .resample('ME', on='order_purchase_dt')
    .agg({'customer_unique_id': 'nunique', 'total_products_price': 'sum'})
    .reset_index()
    .assign(arppu=lambda d: d['total_products_price'].div(d['customer_unique_id']))
)

In [None]:
fig_days.show()
pb.line(data_frame=tmp_df_res, to_slide=True)

**Key Observations:**  

- ARPPU fluctuates daily between 100-250 R$  
- No Black Friday spike in ARPPU  
- Monthly ARPPU remains stable at 130-150 R$  

**By Time of Day**

In [None]:
# Per month and time of day: unique customers and summed product price; ARPPU is their ratio.
tmp_df_res = (
    df_sales
    .groupby(
        [pd.Grouper(key='order_purchase_dt', freq='ME'), 'purchase_time_of_day'],
        observed=False,
    )
    .agg({'customer_unique_id': 'nunique', 'total_products_price': 'sum'})
    .reset_index()
    .assign(arppu=lambda d: d['total_products_price'].div(d['customer_unique_id']))
)

In [None]:
pb.line(data_frame=tmp_df_res, color='purchase_time_of_day')

**Key Observations:**  

- No significant ARPPU differences by time of day  
- Nighttime ARPPU slightly lower  

**By Review Score**

In [None]:
# Per month and review score: unique customers and summed product price; ARPPU is their ratio.
tmp_df_res = (
    df_sales
    .groupby(
        [pd.Grouper(key='order_purchase_dt', freq='ME'), 'order_avg_reviews_score'],
        observed=False,
    )
    .agg({'customer_unique_id': 'nunique', 'total_products_price': 'sum'})
    .reset_index()
    .assign(arppu=lambda d: d['total_products_price'].div(d['customer_unique_id']))
)

Since the review score does not have a categorical type, we might lose groups with no data after grouping. 

Let's restore the dates.

In [None]:
tmp_df_res = tmp_df_res.preproc.restore_full_index(
    date_cols='order_purchase_dt'
    , group_cols='order_avg_reviews_score'
    , freq='ME'
)

In [None]:
pb.line(data_frame=tmp_df_res, color='order_avg_reviews_score', to_slide=True)

**Key Observations:**  

- 1-star review orders typically had higher ARPPU  
- 2-star orders often had higher ARPPU than 3/4/5-star orders  

**By Whether the Order is Delayed or Not**

In [None]:
# Per month and delay flag: unique customers and summed product price; ARPPU is their ratio.
tmp_df_res = (
    df_sales
    .groupby(
        [pd.Grouper(key='order_purchase_dt', freq='ME'), 'is_delayed'],
        observed=False,
    )
    .agg({'customer_unique_id': 'nunique', 'total_products_price': 'sum'})
    .reset_index()
    .assign(arppu=lambda d: d['total_products_price'].div(d['customer_unique_id']))
)

In [None]:
pb.line(data_frame=tmp_df_res, color='is_delayed')

**Key Observations:**  

- June 2017 saw major ARPPU peak for delayed orders  
- Smaller peaks in April/December 2017 and January/June 2018  

**By Order Weight Category**

In [None]:
# Per month and weight category: unique customers and summed product price; ARPPU is their ratio.
tmp_df_res = (
    df_sales
    .groupby(
        [pd.Grouper(key='order_purchase_dt', freq='ME'), 'order_total_weight_cat'],
        observed=False,
    )
    .agg({'customer_unique_id': 'nunique', 'total_products_price': 'sum'})
    .reset_index()
    .assign(arppu=lambda d: d['total_products_price'].div(d['customer_unique_id']))
)

In [None]:
pb.line(data_frame=tmp_df_res, color='order_total_weight_cat')

**Key Observations:**  

- Heavy orders show more ARPPU variability  
- Consistently: heavy > medium > light order ARPPU  
- April-July 2017: significant heavy order ARPPU decline, then fluctuating growth  

**By Delivery Time Category**

In [None]:
# Per month and delivery-time category: unique customers and summed product price; ARPPU is their ratio.
tmp_df_res = (
    df_sales
    .groupby(
        [pd.Grouper(key='order_purchase_dt', freq='ME'), 'delivery_time_days_cat'],
        observed=False,
    )
    .agg({'customer_unique_id': 'nunique', 'total_products_price': 'sum'})
    .reset_index()
    .assign(arppu=lambda d: d['total_products_price'].div(d['customer_unique_id']))
)

In [None]:
pb.line(data_frame=tmp_df_res, color='delivery_time_days_cat', to_slide=True)

**Key Observations:**  

- Higher ARPPU typically correlates with longer delivery times  

**By Presence of Installment Payments**

In [None]:
# Per month and installment flag: unique customers and summed product price; ARPPU is their ratio.
tmp_df_res = (
    df_sales
    .groupby(
        [pd.Grouper(key='order_purchase_dt', freq='ME'), 'order_has_installment'],
        observed=False,
    )
    .agg({'customer_unique_id': 'nunique', 'total_products_price': 'sum'})
    .reset_index()
    .assign(arppu=lambda d: d['total_products_price'].div(d['customer_unique_id']))
)

In [None]:
pb.line(data_frame=tmp_df_res, color='order_has_installment', to_slide=True)

**Key Observations:**  

- Installment orders consistently show higher ARPPU  
- More ARPPU variability for installment orders  
- Pre-July 2017: steady ARPPU decline for installments  
- Post-July 2017: fluctuating growth  

**By Top Customer Cities**

Since there are cities with very few sales, we will select the top 5 cities by sales volume.


In [None]:
# The five customer cities with the most unique orders, as a plain list.
top_cities = list(
    df_sales
    .groupby('customer_city', observed=False)['order_id']
    .nunique()
    .nlargest(n=5)
    .index
)

In [None]:
# Per month and city (top cities only): unique customers and summed product price;
# ARPPU is their ratio.
# NOTE(review): if customer_city is categorical, observed=False also emits rows for
# the filtered-out cities (ARPPU = NaN) — confirm this is intended.
tmp_df_res = (
    df_sales
    .loc[df_sales['customer_city'].isin(top_cities)]
    .groupby(
        [pd.Grouper(key='order_purchase_dt', freq='ME'), 'customer_city'],
        observed=False,
    )
    .agg({'customer_unique_id': 'nunique', 'total_products_price': 'sum'})
    .reset_index()
    .assign(arppu=lambda d: d['total_products_price'].div(d['customer_unique_id']))
)

In [None]:
pb.line(data_frame=tmp_df_res, color='customer_city')

**Key Observations:**  

- February 2017: major ARPPU spike in Brasília  
- Otherwise minimal differences among top 5 cities  

### 4.2.8 Number of Customers

In [None]:
pb.configure(
    df = df_sales
    , time_column = 'order_purchase_dt'
    , time_column_label = 'Date'
    , metric = 'customer_unique_id'
    , metric_label = 'Number of Customers'
    , agg_func = 'nunique'
    , freq='ME'
)

**By Day and Month**

In [None]:
# Draw the configured metric at daily and monthly granularity;
# to_slide is True only for the monthly ('ME') chart.
for freq in ('D', 'ME'):
    pb.line_resample(freq=freq, to_slide=(freq == 'ME')).show()

**Key Observations:**  

- Black Friday (Nov 24, 2017) saw anomalous customer count peak  
- Customer growth continued until 2018, then stabilized at 6-7k monthly  

**Monthly Growth**

In [None]:
pb.period_change(period='mom', to_slide=True)

**Key Observations:**  

- Customer counts dropped >5% month-over-month in April/June/December 2017 and February/June 2018  

**By Time of Day**

In [None]:
pb.line_resample(color='purchase_time_of_day', to_slide=True)

**Key Observations:**  

- Fewer customers at night  
- Evening sees most customer activity  

**By Day of Week**

In [None]:
# Weekly area chart split by weekday. The metric configured for this section is the
# unique customer count, so the title names customers (it previously said "Sales",
# copied from the Number of Sales section).
pb.area_resample(color='purchase_weekday', freq='W', title='Number of Customers by Day of Week and Week')

In [None]:
# Week x weekday heatmap. The metric configured for this section is the unique
# customer count, so the title names customers (it previously said "Sales",
# copied from the Number of Sales section).
pb.heatmap(x=pd.Grouper(key=pb.time_column, freq='W')
        , y='purchase_weekday'
        , text_auto=False
        , title='Number of Customers by Day of Week and Week'
)

**Key Observations:**  

- Minimal weekday/weekend customer count differences  
- Weekends slightly lower  

**By Weekday vs Weekend**

In [None]:
pb.line_resample(color='purchase_day_type')

**Key Observations:**  

- Weekday customer growth outpaced weekends  

**By Top Customer States**

In [None]:
pb.line_resample(color='customer_state', to_slide=True)

**Key Observations:**  

- São Paulo consistently led in customer counts  
- Unlike other states, maintained 2018 customer levels  
- Rio de Janeiro and Minas Gerais ranked 2nd/3rd  

**By Top Customer Cities**

In [None]:
pb.line_resample(color='customer_city', to_slide=True)

**Key Observations:**  

- São Paulo city consistently had most customers  
- Rio de Janeiro ranked second  
- Only São Paulo showed 2018 monthly growth  

### 4.2.9 Share of New Customers

In [None]:
pb.configure(
    time_column = 'order_purchase_dt'
    , time_column_label = 'Date'
    , metric = 'new_customers_share'
    , metric_label = 'Share of New Customers'
    , freq='ME'
    , update_fig={'yaxis': {'tickformat': '.1%'}}
)

**By Day and Month**

Since one customer could make multiple orders at the same time, we need to remove such duplicates.


In [None]:
# Daily share of new customers: first-purchase flags summed over distinct
# (timestamp, customer, flag) rows, divided by the number of unique customers.
tmp_df_res = (
    df_sales[['order_purchase_dt', 'customer_unique_id', 'sale_is_customer_first_purchase']]
    .drop_duplicates()
    .resample('D', on='order_purchase_dt')
    .agg(
        new_customers_cnt=('sale_is_customer_first_purchase', 'sum'),
        all_customers_cnt=('customer_unique_id', 'nunique'),
    )
    .reset_index()
    .assign(new_customers_share=lambda d: d['new_customers_cnt'].div(d['all_customers_cnt']))
)

In [None]:
fig_days = pb.line(data_frame=tmp_df_res, title='Share of New Customers by Day')

In [None]:
# Monthly share of new customers: first-purchase flags summed over distinct
# (timestamp, customer, flag) rows, divided by the number of unique customers.
tmp_df_res = (
    df_sales[['order_purchase_dt', 'customer_unique_id', 'sale_is_customer_first_purchase']]
    .drop_duplicates()
    .resample('ME', on='order_purchase_dt')
    .agg(
        new_customers_cnt=('sale_is_customer_first_purchase', 'sum'),
        all_customers_cnt=('customer_unique_id', 'nunique'),
    )
    .reset_index()
    .assign(new_customers_share=lambda d: d['new_customers_cnt'].div(d['all_customers_cnt']))
)

In [None]:
fig_days.show()
pb.line(data_frame=tmp_df_res, to_slide=True)

**Key Observations:**  

- Daily new customer share never fell below 92%  
- Monthly new customer share gradually declined (still >97%)  
- Nearly all active customers are new  

**By Weekday vs Weekend**

In [None]:
# Monthly share of new customers per day type (observed groups only).
tmp_df_res = (
    df_sales[[
        'order_purchase_dt',
        'customer_unique_id',
        'sale_is_customer_first_purchase',
        'purchase_day_type',
    ]]
    .drop_duplicates()
    .groupby(
        [pd.Grouper(key='order_purchase_dt', freq='ME'), 'purchase_day_type'],
        observed=True,
    )
    .agg(
        new_customers_cnt=('sale_is_customer_first_purchase', 'sum'),
        all_customers_cnt=('customer_unique_id', 'nunique'),
    )
    .reset_index()
    .assign(new_customers_share=lambda d: d['new_customers_cnt'].div(d['all_customers_cnt']))
)

In [None]:
pb.line(data_frame=tmp_df_res, color='purchase_day_type')

**Key Observations:**  

- Weekend new customer share fluctuates more than weekdays  
- Weekends typically have lower new customer share 

**By Top Customer States**

In [None]:
# The five customer states with the most unique orders, as a plain list.
top_states = list(
    df_sales
    .groupby('customer_state', observed=False)['order_id']
    .nunique()
    .nlargest(n=5)
    .index
)

In [None]:
# Monthly share of new customers per state, restricted to the top states.
# NOTE(review): if customer_state is categorical, observed=False also emits rows for
# the filtered-out states (share = NaN) — confirm this is intended.
tmp_df_res = (
    df_sales
    .loc[
        df_sales['customer_state'].isin(top_states),
        ['order_purchase_dt', 'customer_unique_id', 'sale_is_customer_first_purchase', 'customer_state'],
    ]
    .drop_duplicates()
    .groupby(
        [pd.Grouper(key='order_purchase_dt', freq='ME'), 'customer_state'],
        observed=False,
    )
    .agg(
        new_customers_cnt=('sale_is_customer_first_purchase', 'sum'),
        all_customers_cnt=('customer_unique_id', 'nunique'),
    )
    .reset_index()
    .assign(new_customers_share=lambda d: d['new_customers_cnt'].div(d['all_customers_cnt']))
)

In [None]:
pb.line(data_frame=tmp_df_res, color='customer_state')

**Key Observations:**  

- Minimal state-level differences in new customer share  
- Rio Grande do Sul showed more variability  

**By Top Customer Cities**

In [None]:
# The five customer cities with the most unique orders, as a plain list.
top_cities = list(
    df_sales
    .groupby('customer_city', observed=False)['order_id']
    .nunique()
    .nlargest(n=5)
    .index
)

In [None]:
# Monthly share of new customers per city, restricted to the top cities.
# NOTE(review): if customer_city is categorical, observed=False also emits rows for
# the filtered-out cities (share = NaN) — confirm this is intended.
tmp_df_res = (
    df_sales
    .loc[
        df_sales['customer_city'].isin(top_cities),
        ['order_purchase_dt', 'customer_unique_id', 'sale_is_customer_first_purchase', 'customer_city'],
    ]
    .drop_duplicates()
    .groupby(
        [pd.Grouper(key='order_purchase_dt', freq='ME'), 'customer_city'],
        observed=False,
    )
    .agg(
        new_customers_cnt=('sale_is_customer_first_purchase', 'sum'),
        all_customers_cnt=('customer_unique_id', 'nunique'),
    )
    .reset_index()
    .assign(new_customers_share=lambda d: d['new_customers_cnt'].div(d['all_customers_cnt']))
)

In [None]:
pb.line(data_frame=tmp_df_res, color='customer_city')

**Key Observations:**  

- São Paulo and Rio de Janeiro show less monthly fluctuation in new customer share  

### 4.2.10 Number of Sellers

In [None]:
# Attach seller ids (via order items) and then seller state/city to every sale.
# Both joins are left joins, so every df_sales row is kept.
tmp_df_sales_sellers = pd.merge(
    pd.merge(df_sales, df_items[['order_id', 'seller_id']], how='left', on='order_id'),
    df_sellers[['seller_id', 'seller_state', 'seller_city']],
    how='left',
    on='seller_id',
)

In [None]:
pb.configure(
    df = tmp_df_sales_sellers
    , time_column = 'order_purchase_dt'
    , time_column_label = 'Date'
    , metric = 'seller_id'
    , metric_label = 'Number of Sellers'
    , agg_func = 'nunique'
    , freq = 'ME'
)

**By Day and Month**

In [None]:
# Draw the configured metric at daily and monthly granularity;
# to_slide is True only for the monthly ('ME') chart.
for freq in ('D', 'ME'):
    pb.line_resample(freq=freq, to_slide=(freq == 'ME')).show()

**Key Observations:**  

- Black Friday (Nov 24, 2017) saw anomalous seller count peak  
- Monthly seller counts show steady growth  

**By Time of Day**

In [None]:
pb.line_resample(color='purchase_time_of_day', to_slide=True)

**Key Observations:**  

- Fewer active sellers at night  
- Evening/afternoon see most seller activity  

**By Weekday vs Weekend**

In [None]:
pb.line_resample(color='purchase_day_type')

**Key Observations:**  

- Weekday seller growth outpaced weekends  

**By Top Seller States**

In [None]:
pb.line_resample(color='seller_state', to_slide=True)

**Key Observations:**  

- São Paulo consistently led in seller counts with strongest growth  
- Paraná and Minas Gerais ranked 2nd/3rd  
- All top 5 states showed steady seller growth  

**By Top Seller Cities**

In [None]:
pb.line_resample(color='seller_city', to_slide=True)

**Key Observations:**  

- São Paulo city consistently had most sellers with strongest growth  
- Curitiba ranked second  

### 4.2.11 Share of New Sellers

In [None]:
pb.configure(
    time_column = 'order_purchase_dt'
    , time_column_label = 'Date'
    , metric = 'new_sellers_share'
    , metric_label = 'Share of New Sellers'
    , freq='ME'
    , update_fig={'yaxis': {'tickformat': '.1%'}}
)

In [None]:
# Earliest purchase timestamp per seller, broadcast back to every row of that seller.
tmp_df_sales_sellers['seller_first_order_dt'] = (
    tmp_df_sales_sellers
    .groupby('seller_id')['order_purchase_dt']
    .transform('min')
)
# A row is a seller's first order when its timestamp equals that earliest timestamp.
tmp_df_sales_sellers['is_seller_first_order'] = tmp_df_sales_sellers['order_purchase_dt'].eq(
    tmp_df_sales_sellers['seller_first_order_dt']
)

**By Day and Month**

Since a seller can appear in several orders placed at the same moment as their first order, we need to remove such duplicates.


In [None]:
# Daily share of new sellers: first-order flags summed over distinct
# (timestamp, seller, flag) rows, divided by the number of unique sellers.
tmp_df_res = (
    tmp_df_sales_sellers[['order_purchase_dt', 'seller_id', 'is_seller_first_order']]
    .drop_duplicates()
    .resample('D', on='order_purchase_dt')
    .agg(
        new_sellers_cnt=('is_seller_first_order', 'sum'),
        all_sellers_cnt=('seller_id', 'nunique'),
    )
    .reset_index()
    .assign(new_sellers_share=lambda d: d['new_sellers_cnt'].div(d['all_sellers_cnt']))
)

In [None]:
fig_days = pb.line(data_frame=tmp_df_res
               , title='Share of New Sellers by Day'
)

In [None]:
# Monthly share of new sellers: first-order flags summed over distinct
# (timestamp, seller, flag) rows, divided by the number of unique sellers.
tmp_df_res = (
    tmp_df_sales_sellers[['order_purchase_dt', 'seller_id', 'is_seller_first_order']]
    .drop_duplicates()
    .resample('ME', on='order_purchase_dt')
    .agg(
        new_sellers_cnt=('is_seller_first_order', 'sum'),
        all_sellers_cnt=('seller_id', 'nunique'),
    )
    .reset_index()
    .assign(new_sellers_share=lambda d: d['new_sellers_cnt'].div(d['all_sellers_cnt']))
)

In [None]:
fig_days.show()
pb.line(data_frame=tmp_df_res, to_slide=True)

**Key Observations:**  

- New seller share declined until June 2017  
- Stabilized at 10-20% thereafter  

**By Weekday vs Weekend**

In [None]:
# Monthly new-seller share split by purchase_day_type.
tmp_df_res = (
    tmp_df_sales_sellers
    .loc[:, ['order_purchase_dt', 'seller_id', 'is_seller_first_order', 'purchase_day_type']]
    .drop_duplicates()
    .groupby(
        [pd.Grouper(key='order_purchase_dt', freq='ME'), 'purchase_day_type'],
        observed=True,
    )
    .agg(
        new_sellers_cnt=('is_seller_first_order', 'sum'),
        all_sellers_cnt=('seller_id', 'nunique'),
    )
    .reset_index()
    .assign(new_sellers_share=lambda d: d['new_sellers_cnt'] / d['all_sellers_cnt'])
)

In [None]:
pb.line(data_frame=tmp_df_res, color='purchase_day_type')

**Key Observations:**  

- Weekends consistently had lower new seller share than weekdays  

**By Top Seller States**

In [None]:
# The five seller states with the largest number of distinct orders.
top_states = (
    tmp_df_sales_sellers
    .groupby('seller_state', observed=False)['order_id']
    .nunique()
    .nlargest(5)
    .index
    .tolist()
)

In [None]:
# Monthly new-seller share for the top seller states only.
# NOTE(review): observed=False keeps every category of seller_state in the result
# if that column is categorical, including states filtered out above - confirm this is intended.
tmp_df_res = (
    tmp_df_sales_sellers
    .loc[lambda x: x.seller_state.isin(top_states)]
    .loc[:, ['order_purchase_dt', 'seller_id', 'is_seller_first_order', 'seller_state']]
    .drop_duplicates()
    .groupby(
        [pd.Grouper(key='order_purchase_dt', freq='ME'), 'seller_state'],
        observed=False,
    )
    .agg(
        new_sellers_cnt=('is_seller_first_order', 'sum'),
        all_sellers_cnt=('seller_id', 'nunique'),
    )
    .reset_index()
    .assign(new_sellers_share=lambda d: d['new_sellers_cnt'] / d['all_sellers_cnt'])
)

In [None]:
pb.line(data_frame=tmp_df_res, color='seller_state')

**Key Observations:**  

- Minimal state-level differences in new seller share  
- Minas Gerais often slightly lower  

In [None]:
del tmp_df_res

### 4.2.12 Ratio of Number of Sellers and Customers

**Active Sellers and Customers**

In [None]:
# Distinct customers per month (month-end bins).
customers_cnt_all = (
    tmp_df_sales_sellers
    .resample('ME', on='order_purchase_dt')['customer_unique_id']
    .nunique()
    .to_frame()
)
# Distinct sellers per month (month-end bins).
sellers_cnt_all = (
    tmp_df_sales_sellers
    .resample('ME', on='order_purchase_dt')['seller_id']
    .nunique()
    .to_frame()
)
# Align both monthly counts on the month index and compute sellers per customer.
tmp_df_res = pd.merge(
    customers_cnt_all,
    sellers_cnt_all,
    left_index=True,
    right_index=True,
).reset_index()
tmp_df_res['seller_customer_ratio'] = (
    tmp_df_res['seller_id'].div(tmp_df_res['customer_unique_id']).round(2)
)

In [None]:
# Column name -> axis label; the first key is plotted on x, the second on y.
labels = pd.Series({
    'order_purchase_dt': 'Date',
    'seller_customer_ratio': 'Ratio of Number of Sellers and Customers',
})
fig = tmp_df_res.viz.line(
    x=labels.index[0],
    y=labels.index[1],
    labels=labels,
    title='Ratio of Number of Sellers and Customers by Month',
)
pb.to_slide(fig)
fig.show()

**Key Observations:**  

- Pre-November 2017: customer growth outpaced sellers  
- Post-November 2017: reversed trend  

**New Sellers and Customers**

In [None]:
# First purchase timestamp per customer, broadcast to every row of that customer.
tmp_df_sales_sellers['first_order_customer_dt'] = (
    tmp_df_sales_sellers.groupby('customer_unique_id')['order_purchase_dt'].transform('min')
)
# True when the row's purchase has the same year and month as the customer's first purchase.
tmp_df_sales_sellers['is_first_month_for_customer'] = (
    tmp_df_sales_sellers['order_purchase_dt'].dt.month.eq(tmp_df_sales_sellers['first_order_customer_dt'].dt.month)
    & tmp_df_sales_sellers['order_purchase_dt'].dt.year.eq(tmp_df_sales_sellers['first_order_customer_dt'].dt.year)
)
# Same two columns, computed per seller.
tmp_df_sales_sellers['first_order_seller_dt'] = (
    tmp_df_sales_sellers.groupby('seller_id')['order_purchase_dt'].transform('min')
)
tmp_df_sales_sellers['is_first_month_for_seller'] = (
    tmp_df_sales_sellers['order_purchase_dt'].dt.month.eq(tmp_df_sales_sellers['first_order_seller_dt'].dt.month)
    & tmp_df_sales_sellers['order_purchase_dt'].dt.year.eq(tmp_df_sales_sellers['first_order_seller_dt'].dt.year)
)
# Distinct customers per month, restricted to rows in the customer's first month.
customers_cnt_new = (
    tmp_df_sales_sellers
    .loc[tmp_df_sales_sellers['is_first_month_for_customer']]
    .resample('ME', on='order_purchase_dt')['customer_unique_id']
    .nunique()
    .to_frame()
)
# Distinct sellers per month, restricted to rows in the seller's first month.
sellers_cnt_new = (
    tmp_df_sales_sellers
    .loc[tmp_df_sales_sellers['is_first_month_for_seller']]
    .resample('ME', on='order_purchase_dt')['seller_id']
    .nunique()
    .to_frame()
)
# Months present in both tables are kept (default inner merge on the month index).
tmp_df_res = pd.merge(
    customers_cnt_new,
    sellers_cnt_new,
    left_index=True,
    right_index=True,
).reset_index()
tmp_df_res['seller_customer_ratio'] = (
    tmp_df_res['seller_id'].div(tmp_df_res['customer_unique_id']).round(2)
)

In [None]:
# Column name -> axis label; the first key is plotted on x, the second on y.
labels = pd.Series({
    'order_purchase_dt': 'Date',
    'seller_customer_ratio': 'Ratio of New Sellers and New Customers',
})
fig = tmp_df_res.viz.line(
    x=labels.index[0],
    y=labels.index[1],
    labels=labels,
    title='Ratio of Number of New Sellers and Number of New Customers by Month',
)
pb.to_slide(fig)
fig.show()

**Key Observations:**  

- Pre-July 2017: stronger new customer growth  
- Post-July 2017: similar growth rates for new customers/sellers  

### 4.2.13 Number of Orders per Customer

**By month**

In [None]:
# One row per (month, customer) holding the number of distinct orders
# that customer placed in that month.
tmp_df_res = (
    df_sales
    .groupby([pd.Grouper(key='order_purchase_dt', freq='ME'), 'customer_unique_id'])['order_id']
    .nunique()
    .reset_index()
)

In [None]:
# Column name -> axis label; the first key is plotted on x, the second on y.
labels = pd.Series({
    'order_purchase_dt': 'Date',
    'order_id': 'Average Number of Orders per Customer',
})
# The per-customer order counts are averaged per month by the plotting call itself.
fig = tmp_df_res.viz.line(
    x=labels.index[0],
    y=labels.index[1],
    agg_func='mean',
    freq='ME',
    labels=labels,
    title='Average Number of Orders per Customer by Month',
)
pb.to_slide(fig)
fig.show()

**Key Observations:**  

- The average number of orders per user fluctuates around 1 throughout the entire period.

### 4.2.14 Number of Reviews

In [None]:
pb.configure(
    df = df_reviews
    , time_column = 'review_creation_dt'
    , time_column_label = 'Date'
    , metric = 'review_id'
    , metric_label = 'Number of Reviews'
    , agg_func = 'nunique'
    , freq = 'ME'
)

**By Day and Month**

In [None]:
# Draw the configured metric at daily and monthly frequency;
# to_slide is True only for the monthly chart.
for freq in ('D', 'ME'):
    pb.line_resample(freq=freq, to_slide=(freq == 'ME')).show()

**Key Observations:**  

- Daily review counts fluctuate significantly with extreme drops  
- Reviews may not be recorded daily but in batches  
- Review counts grew steadily until 2018, then stabilized at 6-8k monthly  

**By Day of Week**

In [None]:
pb.area_resample(color='review_creation_weekday', freq='W', title='Number of Reviews by Day of Week and Week')

In [None]:
pb.heatmap(x=pd.Grouper(key=pb.time_column, freq='W')
        , y='review_creation_weekday'
        , text_auto=False
        , title='Number of Reviews by Day of Week and Week'
        , to_slide=True
)

**Key Observations:**  

- Significantly fewer reviews created on Mondays/Sundays  
- Minimal differences between other weekdays  

**By Weekday vs Weekend**

In [None]:
pb.line_resample(color='review_day_type')

**Key Observations:**  

- Workdays consistently generate more reviews than weekends  

**By Review Score**

In [None]:
pb.line_resample(color='review_score')

**Key Observations:**  

- 5-star reviews significantly outnumber others monthly  
- 2-star reviews are consistently the least common  
- More 1-star reviews than 2/3-star reviews  

### 4.2.15 Review Score

In [None]:
pb.configure(
    df = df_reviews
    , time_column = 'review_creation_dt'
    , time_column_label = 'Date'
    , metric = 'review_score'
    , metric_label = 'Average Review Score'      
    , agg_func = 'mean'
    , freq = 'ME'
)

**By Day and Month**

In [None]:
# Draw the configured metric at daily and monthly frequency;
# to_slide is True only for the monthly chart.
for freq in ('D', 'ME'):
    pb.line_resample(freq=freq, to_slide=(freq == 'ME')).show()

**Key Observations:**  

- Daily average review scores fluctuate wildly (3-5)  
- Monthly averages declined from August 2017 to March 2018, then spiked  

**By Weekday vs Weekend**

In [None]:
pb.line_resample(color='review_day_type', to_slide=True)

**Key Observations:**  

- Workdays consistently have higher average review scores than weekends  

### 4.2.16 NPS

For calculating NPS, we will divide customers into the following groups:
- Promoters: customers who gave a rating of 5
- Passive: customers who gave a rating of 4
- Detractors: customers who gave a rating of 1-3

Let's look at how NPS changed by month.

In [None]:
# Monthly number of distinct reviews per review score (one column per score).
# fill_value=0 turns "no reviews with this score in this month" into 0 instead of NaN;
# without it a single missing score makes detractors/promoters, and therefore NPS, NaN
# for that month.
tmp_df_res = df_reviews.pivot_table(
    index=pd.Grouper(key='review_creation_dt', freq='ME'),
    columns='review_score',
    values='review_id',
    aggfunc='nunique',
    fill_value=0,
)
# Must be computed before the helper columns below are added, otherwise they would be summed too.
tmp_df_res['total_responses'] = tmp_df_res.sum(axis=1)
# Promoters gave a score of 5, detractors a score of 1-3; score 4 (passive) only enters the total.
tmp_df_res['promoters'] = tmp_df_res[5]
tmp_df_res['detractors'] = tmp_df_res[1] + tmp_df_res[2] + tmp_df_res[3]
# NPS = (% promoters) - (% detractors), expressed on a -100..100 scale.
tmp_df_res['nps'] = (tmp_df_res['promoters'] - tmp_df_res['detractors']) * 100 / tmp_df_res['total_responses']
tmp_df_res.reset_index(inplace=True)

In [None]:
# Column name -> axis label; the first key is plotted on x, the second on y.
labels = pd.Series({
    'review_creation_dt': 'Date',
    'nps': 'NPS',
})
fig = tmp_df_res.viz.line(
    x=labels.index[0],
    y=labels.index[1],
    labels=labels,
    title='NPS by month',
)
pb.to_slide(fig)
fig.show()

**Key Observations:**  

- NPS remains satisfactory (0-49) throughout  
- Many neutral customers, few critical issues  
- Significant NPS drop in March 2018  

### 4.2.17 Freight Cost Ratio

In [None]:
pb.configure(
    df = df_sales
    , time_column = 'order_purchase_dt'
    , time_column_label = 'Date'
    , metric = 'freight_ratio'
    , metric_label = 'Average Freight Cost Ratio'
    , agg_func = 'mean'
    , freq = 'ME'
    , update_fig={'yaxis': {'tickformat': '.1%'}}
)

**By Day and Month**

In [None]:
# Draw the configured metric at daily and monthly frequency;
# to_slide is True only for the monthly chart.
for freq in ('D', 'ME'):
    pb.line_resample(freq=freq, to_slide=(freq == 'ME')).show()

**Key Observations:**  

- Shipping cost share of order value remains stable (0.2-0.22)  

**By Order Weight Category**

In [None]:
pb.line_resample(color='order_total_weight_cat')

**Key Observations:**  

- Light items have higher shipping cost share than medium/heavy  
- Minimal difference between medium/heavy items  

**By Presence of Installment Payments**

In [None]:
pb.line_resample(color='order_has_installment')

**Key Observations:**  

- Installment orders consistently have lower shipping cost share  

**By Top Customer States**

In [None]:
pb.line_resample(color='customer_state')

**Key Observations:**  

- São Paulo consistently has lowest shipping cost share among top states  

**By Top Customer Cities**

In [None]:
pb.line_resample(color='customer_city')

**Key Observations:**  

- São Paulo city usually has lowest shipping cost share among top cities  

### 4.2.18 Delivery Time

In [None]:
pb.configure(
    df = df_sales.dropna(subset='delivery_time_days')
    , time_column = 'order_purchase_dt'
    , time_column_label = 'Date'
    , metric = 'delivery_time_days'
    , metric_label = 'Average Delivery Time, days'
    , metric_label_for_distribution = 'Delivery Time, days'     
    , agg_func = 'mean'
    , freq = 'ME'
)

**By Day and Month**

In [None]:
# Two time-series box plots: one with default arguments and one with upper_quantile=0.95.
# NOTE(review): pb.box is called with freq='M' while the rest of the notebook uses 'ME';
# confirm which alias pb.box expects.
for box_kwargs in ({}, {'upper_quantile': 0.95}):
    pb.box(mode='time_series', freq='M', **box_kwargs).show()
# Line charts at daily and monthly frequency; to_slide is True only for the monthly chart.
for freq in ('D', 'ME'):
    pb.line_resample(freq=freq, to_slide=(freq == 'ME')).show()

**Key Observations:**  

- Average delivery time fluctuates daily (5-20 days)  
- Grew from August 2017-February 2018, then dropped sharply to ~8 days  

**By Review Score**

In [None]:
pb.line_resample(color='order_avg_reviews_score', to_slide=True)

**Key Observations:**  

- Lower ratings typically correlate with longer delivery times  

**By Top Customer States**

In [None]:
pb.line_resample(color='customer_state', to_slide=True)

**Key Observations:**  

- São Paulo consistently has fastest delivery among top states  
- Rio de Janeiro and Rio Grande do Sul show slowest deliveries  

**By Top Customer Cities**

In [None]:
pb.line_resample(color='customer_city', to_slide=True)

**Key Observations:**  

- Rio de Janeiro had steeper delivery time increases (Oct 2017-Feb 2018)  

### 4.2.19 Delivery Delay Time

**By Day and Month**

In [None]:
# Column name -> axis label; the first key is plotted on x, the second on y.
labels = pd.Series({
    'order_purchase_dt': 'Date',
    'delivery_delay_days': 'Average Delivery Delay Time, days',
})
# One chart per resampling frequency; only the monthly figure is passed to pb.to_slide.
for freq, period in (('D', 'Day'), ('ME', 'Month')):
    fig = df_sales.viz.line(
        x=labels.index[0],
        y=labels.index[1],
        agg_func='mean',
        freq=freq,
        labels=labels,
        title=f'Average Delivery Delay Time by {period}',
    )
    if freq == 'ME':
        pb.to_slide(fig)
    fig.show()

**Key Observations:**  

- All months show faster-than-expected deliveries  
- Early 2017 to March 2017: improving lead times (~12 days ahead)  
- June 2018: deliveries ~20 days ahead of estimates  

### 4.2.20 Carrier Handoff Delay

**By Day and Month**

In [None]:
# Column name -> axis label; the first key is plotted on x, the second on y.
labels = pd.Series({
    'order_purchase_dt': 'Date',
    'avg_carrier_delivery_delay_days': 'Average Carrier Handoff Delay, days',
})
# One chart per resampling frequency; only the monthly figure is passed to pb.to_slide.
for freq, period in (('D', 'Day'), ('ME', 'Month')):
    fig = df_sales.viz.line(
        x=labels.index[0],
        y=labels.index[1],
        agg_func='mean',
        freq=freq,
        labels=labels,
        title=f'Average Carrier Handoff Delay by {period}',
    )
    if freq == 'ME':
        pb.to_slide(fig)
    fig.show()

**Key Observations:**  

- Carrier handoff consistently faster than limits  
- Pre-May 2017: improving lead times (peaked at 4.5 days ahead)  
- Post-May 2017: declining to ~2.5 days ahead by August  

### 4.2.21 Proportion of Each Stage in Delivery Time

Let's look at what percentage of the total delivery time each stage occupies. 

We exclude orders with anomalous timestamps (a stage dated earlier than the stage that precedes it): there are only a few of them, so dropping them does not significantly affect the result.


In [None]:
# Keep only the four stage timestamps and only orders whose stages are in chronological
# order: purchase <= approved <= handed to carrier <= delivered to customer.
tmp_df_sales = (
    df_sales
    .loc[:, [
        'order_purchase_dt',
        'order_approved_dt',
        'order_delivered_carrier_dt',
        'order_delivered_customer_dt',
    ]]
    .loc[
        lambda x: (x.order_delivered_customer_dt >= x.order_purchase_dt)
        & (x.order_approved_dt >= x.order_purchase_dt)
        & (x.order_delivered_carrier_dt >= x.order_approved_dt)
        & (x.order_delivered_customer_dt >= x.order_delivered_carrier_dt)
    ]
    .dropna()
)

In [None]:
# Total time from purchase to delivery, in seconds (denominator for every stage).
tmp_df_sales['from_purchase_to_customer'] = (
    tmp_df_sales['order_delivered_customer_dt'] - tmp_df_sales['order_purchase_dt']
).dt.total_seconds()
# Each stage's duration as a percentage of the total, rounded to 2 decimals.
for stage_label, stage_start, stage_end in (
    ('From Purchase to Approved', 'order_purchase_dt', 'order_approved_dt'),
    ('From Approved to Carrier', 'order_approved_dt', 'order_delivered_carrier_dt'),
    ('From Carrier to Customer', 'order_delivered_carrier_dt', 'order_delivered_customer_dt'),
):
    tmp_df_sales[stage_label] = (
        (tmp_df_sales[stage_end] - tmp_df_sales[stage_start]).dt.total_seconds()
        * 100 / tmp_df_sales['from_purchase_to_customer']
    ).round(2)

In [None]:
# Reshape to long form: one row per (order date, stage) with that stage's percentage.
tmp_df_sales = (
    tmp_df_sales
    .melt(
        id_vars='order_purchase_dt',
        value_vars=['From Purchase to Approved', 'From Approved to Carrier', 'From Carrier to Customer'],
        var_name='Stage',
        value_name='Percent of All Delivery Time',
    )
    .rename(columns={'order_purchase_dt': 'Date'})
)

In [None]:
# Fix the order of the stages so they appear in chronological order.
category_orders = {
    'Stage': [
        'From Purchase to Approved',
        'From Approved to Carrier',
        'From Carrier to Customer',
    ],
}
fig = tmp_df_sales.viz.area(
    x='Date',
    y='Percent of All Delivery Time',
    color='Stage',
    agg_func='mean',
    freq='ME',
    title='Average Percent of All Delivery Time by Stage and Month',
    category_orders=category_orders,
)
pb.to_slide(fig)
fig.show()

**Key Observations:**  

- Carrier delivery consumes most of total delivery time  

### 4.2.22 Order Weight

In [None]:
pb.configure(
    df = df_sales
    , time_column = 'order_purchase_dt'
    , time_column_label = 'Date'
    , metric = 'total_weight_kg'
    , metric_label = 'Average Weight of Orders, kg'
    , metric_label_for_distribution = 'Weight of Orders, kg'   
    , agg_func = 'mean'
    , freq = 'ME'
)

**By Day and Month**

In [None]:
pb.box(mode='time_series', freq='M').show()
pb.box(mode='time_series', freq='M', upper_quantile=0.95).show()
for freq in ['D', 'ME']:
    pb.line_resample(freq=freq).show()

**Key Observations:**  

- Average order weight declining monthly (2.8kg → 1.9kg)   

**By Review Score**

In [None]:
pb.line_resample(color='order_avg_reviews_score')

**Key Observations:**  

- 1-star orders typically heaviest  
- 2-star orders show most monthly weight variability  

**By Whether the Order is Delayed or Not**

In [None]:
pb.line_resample(color='is_delayed')

**Key Observations:**  

- Delayed orders usually heavier  
- Non-delayed weights more stable monthly  

**By Presence of Installment Payments**

In [None]:
pb.line_resample(color='order_has_installment')

**Key Observations:**  

- Installment orders consistently heavier  

**By Delivery Time Category**

In [None]:
pb.line_resample(color='delivery_time_days_cat')

**Key Observations:**  

- Lighter orders typically deliver faster  

**By Top Customer Cities**

In [None]:
pb.line_resample(color='customer_city')

**Key Observations:**  

- Curitiba shows most monthly weight variability among top cities  

### 4.2.23 Number of Products in Order

Prepare dataframe for analysis.

In [None]:
pb.configure(
    df = df_sales
    , time_column = 'order_purchase_dt'
    , time_column_label = 'Date'
    , metric = 'products_cnt'
    , metric_label = 'Average Number of Products in Order'
    , metric_label_for_distribution = 'Number of Products in Order'   
    , agg_func = 'mean'
    , freq = 'ME'
)

**By Day and Month**

In [None]:
pb.box(mode='time_series', freq='M').show()
for freq in ['D', 'ME']:
    pb.line_resample(freq=freq).show()

**Key Observations:**  

- Average products per order remains stable (1.12-1.16)  

**By Review Score**

In [None]:
pb.line_resample(color='order_avg_reviews_score')

**Key Observations:**  

- 1/2-star orders typically have more products  
- 4/5-star orders usually have fewest products  

**By Whether the Order is Delayed or Not**

In [None]:
pb.box(mode='time_series', color='is_delayed', freq='M').show()
pb.line_resample(color='is_delayed')

**Key Observations:**  

- Delayed orders show more product count variability  

**By Presence of Installment Payments**

In [None]:
pb.box(mode='time_series', color='order_has_installment', freq='M').show()
pb.line_resample(color='order_has_installment')

**Key Observations:**  

- Installment orders usually contain more products  

**By Top Customer Cities**

In [None]:
pb.line_resample(color='customer_city')

**Key Observations:**  

- Curitiba shows most monthly product count variability  

### 4.2.24 Number of Unique Products in Order

In [None]:
pb.configure(
    df = df_sales
    , time_column = 'order_purchase_dt'
    , time_column_label = 'Date'
    , metric = 'unique_products_cnt'
    , metric_label = 'Average Number of Unique Products in Order'
    , metric_label_for_distribution = 'Number of Unique Products in Order'    
    , agg_func = 'mean'
    , freq = 'ME'
)

**By Day and Month**

In [None]:
pb.box(mode='time_series', freq='M').show()
for freq in ['D', 'ME']:
    pb.line_resample(freq=freq).show()

**Key Observations:**  

- Average unique products per order remains stable (1.03-1.045)  

**By Review Score**

In [None]:
pb.line_resample(color='order_avg_reviews_score')

**Key Observations:**  

- 1/2-star orders have more unique products  
- 4/5-star orders have fewest unique products  

**By Whether the Order is Delayed or Not**

In [None]:
pb.box(mode='time_series', color='is_delayed', freq='M').show()
pb.line_resample(color='is_delayed')

**Key Observations:**  

- Non-delayed orders slightly higher in unique products  
- Delayed orders show more variability  

**By Presence of Installment Payments**

In [None]:
pb.box(mode='time_series', color='order_has_installment', freq='M').show()
pb.line_resample(color='order_has_installment')

**Key Observations:**  

- Installment orders consistently have more unique products  

### 4.2.25 Product Price in Order

In [None]:
pb.configure(
    df = df_sales
    , time_column = 'order_purchase_dt'
    , time_column_label = 'Date'
    , metric = 'avg_products_price'
    , metric_label = 'Average Product Price in Order, R$'
    , metric_label_for_distribution = 'Product Price in Order, R$'  
    , agg_func = 'mean'
    , freq = 'ME'
)

**By Day and Month**

In [None]:
pb.box(mode='time_series', freq='M').show()
pb.box(mode='time_series', freq='M', upper_quantile=0.95).show()
for freq in ['D', 'ME']:
    pb.line_resample(freq=freq).show()

**Key Observations:**  

- No Black Friday spike in average product price  
- Monthly average product price fluctuates (115-135 R$)  
- Clear seasonality:  
  - Pre-July 2017: decline  
  - July-Oct 2017: growth  
  - Oct 2017-Mar 2018: decline  
  - Mar-Apr 2018: growth  
  - Post-Apr 2018: decline  

**By Review Score**

In [None]:
pb.line_resample(color='order_avg_reviews_score')

**Key Observations:**  

- 1-star orders typically have highest product prices  
- 3-star orders usually lowest  

**By Whether the Order is Delayed or Not**

In [None]:
pb.line_resample(color='is_delayed')

**Key Observations:**  

- Delayed orders usually have higher product prices  
- June 2017 saw sharp price spike in delayed orders  

**By Presence of Installment Payments**

In [None]:
pb.line_resample(color='order_has_installment')

**Key Observations:**  

- Installment orders have significantly higher product prices  

### 4.2.26 Number of Sellers in Order

In [None]:
pb.configure(
    df = df_sales
    , time_column = 'order_purchase_dt'
    , time_column_label = 'Date'
    , metric = 'sellers_cnt'
    , metric_label = 'Average Number of Sellers in Order'
    , metric_label_for_distribution = 'Number of Sellers in Order'    
    , agg_func = 'mean'
    , freq = 'ME'
)

**By Day and Month**

In [None]:
pb.box(mode='time_series', freq='M').show()
for freq in ['D', 'ME']:
    pb.line_resample(freq=freq).show()

**Key Observations:**  

- Average sellers per order grows minimally (1→1.02)  

**By Review Score**

In [None]:
pb.line_resample(color='order_avg_reviews_score')

**Key Observations:**  

- 1/2-star orders typically involve more sellers  
- 4/5-star orders involve fewest sellers  

**By Whether the Order is Delayed or Not**

In [None]:
pb.box(mode='time_series', color='is_delayed', freq='M').show()
pb.line_resample(color='is_delayed')

**Key Observations:**  

- Non-delayed orders slightly higher in seller count  
- April 2018 saw sharp seller count spike in undelivered orders  

**By Presence of Installment Payments**

In [None]:
pb.box(mode='time_series', color='order_has_installment', freq='M').show()
pb.line_resample(color='order_has_installment')

**Key Observations:**  

- Installment orders consistently involve more sellers  

### 4.2.27 Number of Categories in Order

In [None]:
pb.configure(
    df = df_sales
    , time_column = 'order_purchase_dt'
    , time_column_label = 'Date'
    , metric = 'product_categories_cnt'
    , metric_label = 'Average Number of Categories in Order'
    , metric_label_for_distribution = 'Number of Categories in Order'
    , agg_func = 'mean'
    , freq = 'ME'
)

**By Day and Month**

In [None]:
pb.box(mode='time_series', freq='M').show()
for freq in ['D', 'ME']:
    pb.line_resample(freq=freq).show()

**Key Observations:**  

- Average categories per order remains stable  

**By Review Score**

In [None]:
pb.line_resample(color='order_avg_reviews_score')

**Key Observations:**  

- 1/2-star orders involve more categories  

**By Presence of Installment Payments**

In [None]:
pb.box(mode='time_series', color='order_has_installment', freq='M').show()
pb.line_resample(color='order_has_installment')

**Key Observations:**  

- Installment orders have slightly more categories  

<h2 id="4-3"> 4.3 Customer Analysis</h2>

Let’s create a helper function.

In [None]:
def customer_top(metric: str, show_cnt: bool = True, ascending: bool = False, n: int = 10) -> None:
    """Display the top customers of ``df_customers`` ranked by ``metric``.

    Args:
        metric: Column of ``df_customers`` to sort by.
        show_cnt: When True, also show the ``orders_cnt`` column next to the metric.
        ascending: Sort direction; the default (False) puts the largest values first.
        n: Number of customers to show (defaults to 10).

    The result is shown with ``display`` and indexed by ``customer_unique_id``;
    nothing is returned.
    """
    cols = ['customer_unique_id', metric]
    # Skip the extra column when the metric *is* orders_cnt: selecting it twice creates a
    # duplicated column label, and sort_values then fails because the label is not unique.
    if show_cnt and metric != 'orders_cnt':
        cols.append('orders_cnt')
    display(
        df_customers[cols]
        .sort_values(metric, ascending=ascending)
        .set_index('customer_unique_id')
        .head(n)
    )

### 4.3.1 Number of Customers

Let’s see the total number of customers.

In [None]:
print(f'Total customers: {df_customers.customer_unique_id.nunique():,}')

Let’s examine the daily distribution of customers.

In [None]:
pb.configure(
    df = df_sales
    , time_column = 'order_purchase_dt'
    , time_column_label = 'Date' 
    , metric = 'customer_unique_id'
    , metric_label = 'Share of Customers'
    , metric_label_for_distribution = 'Number of Customers'
    , agg_func = 'nunique'
    , norm_by='all'
    , axis_sort_order='descending'    
)

Let’s look at the statistics and distribution of the metric.

In [None]:
pb.metric_info(freq='D')

**Key Observations:**  

- Typically 100-215 customers made purchases daily  
- 5% of days had ≤45 customers, another 5% had ≥291 customers  

Let’s look at the top days by the number of customers.

In [None]:
pb.metric_top(freq='D')

**Key Observations:**  

- As expected, Black Friday had the highest daily customer count  

### 4.3.2 Number of Purchases

Let’s identify the top customers.

In [None]:
customer_top('orders_cnt', show_cnt=False)

**Key Observations:**  

- User '8d50f5eadf50201ccdcedfb9e2ac8455' made the most purchases  

Let’s look at the statistics and distribution of the metric.

In [None]:
df_customers['orders_cnt'].explore.info(
    labels=dict(orders_cnt='Number of Orders per Customer')
    , title='Distribution of Number of Orders per Customer'
    , xaxis_type='category'
)

**Key Observations:**  

- Most customers (97%) made only 1 purchase ever  
- Only 3% made >1 successful purchase  


### 4.3.3 Total Purchase Amount

Let’s identify the top customers.

In [None]:
customer_top('total_customer_payment')

**Key Observations:**  

- User '0a0a92112bd4c708ca5fde585afaa872' spent significantly more than others (single purchase)  


Let’s look at the statistics and distribution of the metric.

In [None]:
df_customers['total_customer_payment'].explore.info(
    labels=dict(total_customer_payment='Purchase Amount per Customer')
    , title='Distribution of Purchase Amount per Customer'
    , upper_quantile=0.99
    , hist_mode='dual_hist_trim'
)

**Key Observations:**  

- 75% of customers spent <185 R$ lifetime  
- Top 5% spent ≥470 R$  

### 4.3.4 Average Order Value

Let’s identify the top customers.

In [None]:
customer_top('avg_total_order_payment')

**Key Observations:**  

- User '0a0a92112bd4c708ca5fde585afaa872' has highest average order value (single purchase)  


Let’s look at the statistics and distribution of the metric.

In [None]:
df_customers['avg_total_order_payment'].explore.info(
    labels=dict(avg_total_order_payment='Average Order Value per Customer')
    , title='Distribution of Average Order Value per Customer'
    , upper_quantile=0.99
    , hist_mode='dual_hist_trim'
)

**Key Observations:**  

- 75% of customers have average order value <180 R$  
- Top 5% have ≥445 R$  

### 4.3.5 Number of Canceled Orders

Let’s identify the top customers.

In [None]:
customer_top('canceled_orders_cnt')

**Key Observations:**  

- No user canceled >2 orders  

Let’s look at the statistics and distribution of the metric.

In [None]:
df_customers['canceled_orders_cnt'].explore.info(
    labels=dict(canceled_orders_cnt='Number of Canceled Orders')
    , title='Distribution of Number of Canceled Orders per Customer'
    , xaxis_type='category'
)

**Key Observations:**  

- 99% of canceling users only canceled once  

### 4.3.6 Canceled Order Rate

Let’s look at the statistics and distribution of the metric.

In [None]:
df_customers['canceled_share'].explore.info(
    labels=dict(canceled_share='Share of Canceled Orders')
    , title='Distribution of Share of Canceled Orders per Customer'
    , xaxis_type='category'
)

**Key Observations:**  

- 99% of users never canceled an order  

### 4.3.7 Repeat Purchase Rate

Let’s identify the top customers.

In [None]:
customer_top('repeat_purchase_share')

**Key Observations:**  

- User '8d50f5eadf50201ccdcedfb9e2ac8455' has highest repeat purchase rate  

Let’s look at the statistics and distribution of the metric.

In [None]:
df_customers['repeat_purchase_share'].explore.info(
    labels=dict(repeat_purchase_share='Share of Repeat Purchases')
    , title='Distribution of Share of Repeat Purchases per Customer'
    , nbins=20
    , xaxis_type='category'
)

**Key Observations:**  

- 97% of customers have no repeat purchases  

### 4.3.8 Time Between Purchases

Let’s identify the top customers.

In [None]:
customer_top('avg_buys_diff_days')

**Key Observations:**  

- Many users show >500 days between purchases, but with very few purchases  
- Makes average values unreliable  

Let’s look at the statistics and distribution of the metric.

In [None]:
df_customers['avg_buys_diff_days'].explore.info(
    labels=dict(avg_buys_diff_days='Average Time Between Purchases, days')
    , title='Distribution of Average Time Between Purchases'
)

**Key Observations:**  

- 75% have ≤125 days between purchases  
- 5% have ≥311 days  
- ~30% have <1 day between purchases (likely consecutive orders)  

### 4.3.9 Number of Products per Order

Let’s identify the top customers.

In [None]:
customer_top('avg_products_cnt')

**Key Observations:**  

- Some users average 20-21 items/order (all single orders)  

Let’s look at the statistics and distribution of the metric.

In [None]:
df_customers['avg_products_cnt'].explore.info(
    labels=dict(avg_products_cnt='Average Number of Products in Order')
    , title='Distribution of Average Number of Products in Order per Customer'
    , width=600
)

**Key Observations:**  

- 87% average 1 item/order  
- ~1% average ≥3 items  

### 4.3.10 Product Price per Order

Let’s identify the top customers.

In [None]:
customer_top('avg_products_price')

Let’s look at the statistics and distribution of the metric.

In [None]:
df_customers['avg_products_price'].explore.info(
    labels=dict(avg_products_price='Average Product Price in Order')
    , title='Distribution of Average Product Price in Order per Customer'
    , nbins=20
)

**Key Observations:**  

- 75% have average product price ≤140 R$  
- Top 5% have ≥367 R$  

### 4.3.11 Number of Reviews

Let’s identify the top customers.

In [None]:
customer_top('reviews_cnt')

**Key Observations:**  

- User with id '8d50f5eadf50201ccdcedfb9e2ac8455' left significantly more reviews than other users. But they also made many orders.


Let’s look at the statistics and distribution of the metric.

In [None]:
df_customers['reviews_cnt'].explore.info(
    labels=dict(reviews_cnt='Number of Reviews per Customer')
    , title='Distribution of Number of Reviews per Customer'
    , nbins=20
)

**Key Observations:**  

- 94% left only 1 review  
- 2% left 2 reviews  

### 4.3.12 Review Score

Let’s identify the top customers.

In [None]:
customer_top('customer_avg_reviews_score')

Let’s look at the statistics and distribution of the metric.

In [None]:
df_customers['customer_avg_reviews_score'].explore.info(
    labels=dict(customer_avg_reviews_score='Average Review Score per Customer')
    , title='Distribution of Average Review Score per Customer'
    , nbins=5
)

**Key Observations:**  

- 57% average 5-star reviews  
- 19% average 4-star  

### 4.3.13 Delivery Cost

Let’s identify the top customers.

In [None]:
customer_top('avg_order_total_freight_value')

**Key Observations:**  

- User 'fff5eb4918b2bf4b2da476788d42051c' has unusually high shipping costs (single purchase)  

Let’s look at the statistics and distribution of the metric.

In [None]:
# Statistics and distribution of the average freight value per customer.
# The axis label no longer carries trailing whitespace.
df_customers['avg_order_total_freight_value'].explore.info(
    labels=dict(avg_order_total_freight_value='Average Freight Value')
    , title='Distribution of Average Freight Value per Customer'
    , upper_quantile=0.99
    , hist_mode='dual_hist_trim'
)

**Key Observations:**  

- 75% have average shipping ≤24 R$  
- Top 5% have ≥54 R$  


### 4.3.14 Delivery Time

Let’s identify the top customers.

In [None]:
customer_top('avg_delivery_time_days')

Let’s look at the statistics and distribution of the metric.

In [None]:
# Statistics and distribution of the average delivery time per customer.
# The label names the metric itself; "Distribution of" belongs only in the title,
# as in the other explore.info calls in this section.
df_customers['avg_delivery_time_days'].explore.info(
    labels=dict(avg_delivery_time_days='Average Delivery Time, days')
    , title='Distribution of Average Delivery Time'
    , upper_quantile=0.99
    , hist_mode='dual_hist_trim'
)

**Key Observations:**  

- 75% have average delivery ≤16 days  
- Top 5% have ≥29 days  

### 4.3.15 Delivery Delay Time

Let’s identify the top customers.

In [None]:
customer_top('avg_delivery_delay_days')

Let’s look at the statistics and distribution of the metric.

In [None]:
df_customers['avg_delivery_delay_days'].explore.info(
    labels=dict(avg_delivery_delay_days='Average Delivery Delay Time, days')
    , title='Distribution of Average Delivery Delay Time'
    , lower_quantile=0.01
    , upper_quantile=0.99
    , hist_mode='dual_hist_trim'
)

**Key Observations:**  

- Top 5% have ≥25 days early delivery  
- Median: 6-16 days early  
- Bottom 5% have ≥4 days late  

### 4.3.16 Order Weight

Let’s identify the top customers.

In [None]:
customer_top('avg_order_total_weight_kg')

Let’s look at the statistics and distribution of the metric.

In [None]:
df_customers['avg_order_total_weight_kg'].explore.info(
    labels=dict(avg_order_total_weight_kg='Average Order Weight per Customer')
    , title='Distribution of Average Order Weight per Customer'
    , upper_quantile=0.99
    , hist_mode='dual_hist_trim'    
)

**Key Observations:**  

- 75% have average order weight ≤2kg  
- Top 5% have ≥10kg  

### 4.3.17 Time from First to Second Purchase

Let’s identify the top customers.

In [None]:
customer_top('from_first_to_second_days')

**Key Observations:**  

- User 'd8f3c4f441a9b59a29f977df16724f38' has longest 1st→2nd purchase gap  

In [None]:
customer_top('from_first_to_second_days', ascending=True)

**Key Observations:**  

- Some customers made 1st/2nd purchases within seconds

Let’s look at the statistics and distribution of the metric.

In [None]:
df_customers['from_first_to_second_days'].explore.info(
    labels=dict(from_first_to_second_days='Time From First to Second Purchase, days')
    , title='Distribution of Time From First to Second Purchase'
)

**Key Observations:**  

- ~50% have >29 days between 1st/2nd purchase  
- Top 25% have ≥125 days  
- Top 5% have ≥319 days  

### 4.3.18 Time from First to Last Purchase

Let’s identify the top customers.

In [None]:
customer_top('from_first_to_last_days')

**Key Observations:**  

- User 'd8f3c4f441a9b59a29f977df16724f38' has longest 1st→last purchase span (2 purchases)  


Let’s look at the statistics and distribution of the metric.

In [None]:
df_customers['from_first_to_last_days'].explore.info(
    labels=dict(from_first_to_last_days='Time From First to Last Purchase, days')
    , title='Distribution of Time From First to Last Purchase'
)

**Key Observations:**  

- ~50% have >35 days between 1st/last purchase  
- Top 25% have ≥140 days  
- Top 5% have ≥335 days  

### 4.3.19 Number of Months with Purchases

Let’s identify the top customers.

In [None]:
customer_top('months_with_buys')

**Key Observations:**  

- User '8d50f5eadf50201ccdcedfb9e2ac8455' had most months with purchases  


Let’s look at the statistics and distribution of the metric.

In [None]:
df_customers['months_with_buys'].explore.info(
    labels=dict(months_with_buys='Number of Months with Purchases')
    , title='Distribution of Number of Months with Purchases per Customer'
    , nbins=10 
)

**Key Observations:**  

- 96% of customers only purchased in 1 month  


### 4.3.20 Maximum Consecutive Months with Purchases

Let’s identify the top customers.

In [None]:
customer_top('max_consecutive_months_with_buys')

Let’s look at the statistics and distribution of the metric.

In [None]:
df_customers['max_consecutive_months_with_buys'].explore.info(
    labels=dict(max_consecutive_months_with_buys='Maximum Consecutive Months with Purchases')
    , title='Distribution of Maximum Consecutive Months with Purchases'
    , nbins=10 
)

**Key Observations:**  

- Maximum consecutive months: 6 (1 customer)  
- 3 consecutive months: 8 customers  
- 2 consecutive months: 438 customers  

### 4.3.21 Additional Metrics

- **What percentage of customers make only one purchase?**

In [None]:
# Series indexed by customer_unique_id holding the number of distinct order_id values per customer.
tmp_df_res = df_sales.groupby(['customer_unique_id'])['order_id'].nunique()

In [None]:
# Percentage (rounded to 2 decimals) of customers whose distinct-order count equals 1.
display(f'{(tmp_df_res[tmp_df_res == 1].count() * 100 / tmp_df_res.count()).round(2)}% of customers make a purchase only once')

- **What percentage of customers make more than one purchase?**

In [None]:
# Percentage (rounded to 2 decimals) of customers with more than one distinct order.
display(f'{(tmp_df_res[tmp_df_res > 1].count() * 100 / tmp_df_res.count()).round(2)}% of customers make more than one purchase')

- **What percentage of customers make more than two purchases?**

In [None]:
# Percentage (rounded to 2 decimals) of customers with more than two distinct orders.
display(f'{(tmp_df_res[tmp_df_res > 2].count() * 100 / tmp_df_res.count()).round(2)}% of customers make more than two purchases')

- **What percentage of customers make more than three purchases?**

In [None]:
# Percentage (rounded to 2 decimals) of customers with more than three distinct orders.
# The message now matches the `> 3` threshold (it previously said "two").
display(f'{(tmp_df_res[tmp_df_res > 3].count() * 100 / tmp_df_res.count()).round(2)}% of customers make more than three purchases')

- **What percentage of customers make more than 4, 5, …, 9 purchases?**

In [None]:
# For each threshold n = 4..9, show the percentage (rounded to 2 decimals) of customers
# whose distinct-order count exceeds n. A stray "two" was removed from the message.
for n in range(4, 10):
    display(f'{(tmp_df_res[tmp_df_res > n].count() * 100 / tmp_df_res.count()).round(2)}% of customers make more than {n} purchases')

- **Are there customers who make purchases regularly (monthly)?**

In [None]:
# Keep only the columns needed for the regularity check and drop rows without a purchase timestamp.
tmp_df_res = df_sales[['order_purchase_dt', 'customer_unique_id', 'order_id']].dropna(subset='order_purchase_dt')
# Calendar month of each purchase as a monthly pandas Period.
tmp_df_res['year_month'] = tmp_df_res.order_purchase_dt.dt.to_period('M')
# Earliest purchase month of each customer, repeated on every row of that customer.
tmp_df_res['first_month'] = tmp_df_res.groupby('customer_unique_id')['year_month'].transform('min')
# Most recent purchase month present in the data.
last_month = tmp_df_res['year_month'].max()

In [None]:
# Collapse to one row per customer: number of distinct purchase months and the first purchase month.
tmp_df_res = (tmp_df_res.groupby('customer_unique_id', as_index=False)
          .agg(
              year_months = ('year_month', 'nunique')
              , first_month = ('first_month', 'first')              
          )
)
# Number of calendar months from the customer's first purchase month through last_month, inclusive
# (subtracting Periods gives an offset whose .n is the month difference, hence the +1).
tmp_df_res['all_months'] = (last_month - tmp_df_res.first_month).apply(lambda x: x.n + 1)
# True when the customer bought in every month of that span.
tmp_df_res['is_in_all_months'] = tmp_df_res['all_months'] == tmp_df_res['year_months']
# Keep only those customers. Note: a customer whose first purchase falls in last_month
# satisfies the condition trivially (1 month out of 1).
tmp_df_res = tmp_df_res[tmp_df_res.is_in_all_months]

In [None]:
# Show the ten remaining customers with the longest span of months (all_months), largest first.
tmp_df_res.sort_values('all_months', ascending=False).head(10)

**Key Observations:**  

- No customers purchased in all months  
- Maximum regular purchases: 2 consecutive months  

<h2 id="4-4"> 4.4 Seller Analysis</h2>

Let’s create a helper function.

In [None]:
def seller_top(metric: str, show_cnt: bool = True, ascending: bool = False):
    """Display the ten leading sellers ranked by ``metric``.

    Reads the notebook-level ``df_sellers`` frame, sorts it by ``metric`` and
    renders the first ten rows, indexed by ``seller_id``, via ``display``.

    Args:
        metric: Name of the ``df_sellers`` column to rank by.
        show_cnt: When True, the ``products_cnt`` and ``orders_cnt`` columns are
            shown next to the metric for context.
        ascending: Sort direction; the default (False) puts the largest values first.
    """
    cols = ['seller_id', metric]
    if show_cnt:
        # Skip a context column that is already the ranked metric: selecting the
        # same label twice makes ``sort_values`` fail on a non-unique column.
        cols += [col for col in ('products_cnt', 'orders_cnt') if col != metric]
    display(
        df_sellers[cols]
        .sort_values(metric, ascending=ascending)
        .set_index('seller_id')
        .head(10)
    )

### 4.4.1 Number of Products

Let’s identify the top sellers.

In [None]:
seller_top('products_cnt')

**Key Observations:**  

- Seller "6560211a19b47992c3666cc44a7e94c0" sold the most products  

Let’s look at the statistics and distribution of the metric.

In [None]:
# Statistics and distribution of the number of products sold per seller.
# The labels mapping is keyed by the plotted column name (products_cnt), as in the
# other explore.info calls; the previous key (unique_products_cnt) was a copy-paste slip.
df_sellers['products_cnt'].explore.info(
    labels=dict(products_cnt='Number of Products')
    , title='Distribution of Number of Products Per Seller'
    , upper_quantile=0.95
    , hist_mode='dual_hist_trim'
)

**Key Observations:**  

- 75% of sellers sold ≤26 products total  
- Top 5% sold >150 products  

### 4.4.2 Number of Unique Products

Let’s identify the top sellers.

In [None]:
seller_top('unique_products_cnt')

**Key Observations:**  

- Seller "4a3ca9315b744ce9f8e9374361493884" sold the most unique products  

Let’s look at the statistics and distribution of the metric.

In [None]:
df_sellers['unique_products_cnt'].explore.info(
    labels=dict(unique_products_cnt='Number of Unique Products')
    , title='Distribution of Number of Unique Products Per Seller'
    , upper_quantile=0.95
    , hist_mode='dual_hist_trim'
)

**Key Observations:**  

- 75% of sellers sold ≤10 unique products  
- Top 5% sold >45 unique products  

### 4.4.3 Number of Orders

Let’s identify the top sellers.

In [None]:
seller_top('orders_cnt', show_cnt=False)

**Key Observations:**  

- Seller "6560211a19b47992c3666cc44a7e94c0" participated in the most orders  

Let’s look at the statistics and distribution of the metric.

In [None]:
df_sellers['orders_cnt'].explore.info(
    labels=dict(orders_cnt='Number of Orders')
    , title='Distribution of Number of Orders Per Seller'
    , upper_quantile=0.95
    , hist_mode='dual_hist_trim'
)

**Key Observations:**  

- 75% of sellers participated in ≤22 orders  
- Top 5% participated in ≥130 orders  

### 4.4.4 Total Sales Revenue

Let’s identify the top sellers.

In [None]:
seller_top('revenue')

**Key Observations:**  

- Seller "4869f7a5dfa277a7dca6462dcf3b52b2" generated the most revenue  


Let’s look at the statistics and distribution of the metric.

In [None]:
df_sellers['revenue'].explore.info(
    labels=dict(revenue='Seller Revenue')
    , title='Distribution of Seller Revenue'
    , upper_quantile=0.95
    , hist_mode='dual_hist_trim'
)

**Key Observations:**  

- 75% of sellers made ≤3,500 R$  
- Top 5% made ≥17,000 R$  

### 4.4.5 Number of Products per Order

Let’s identify the top sellers.

In [None]:
seller_top('avg_prouducts_cnt')

**Key Observations:**  

- Seller '0b36063d5818f81ccb94b54adfaebbf5' has highest average products per order (single order)  

Let’s look at the statistics and distribution of the metric.

In [None]:
# Statistics and distribution of the average number of products per order, by seller.
# NOTE(review): the column name 'avg_prouducts_cnt' is misspelled, but it is kept because
# it must match the column created where df_sellers is built (outside this section).
df_sellers['avg_prouducts_cnt'].explore.info(
    labels=dict(avg_prouducts_cnt='Average Number of Products in Order')
    , title='Distribution of Average Number of Products in Order per Seller'
    , upper_quantile=0.95
    , hist_mode='dual_hist_trim'
)

**Key Observations:**  

- 75% of sellers average ≤1.14 products per order  
- Top 1% average ≥3 products  

### 4.4.6 Total Product Value per Order

Let’s identify the top sellers.

In [None]:
seller_top('avg_order_total_price')

**Key Observations:**  

- Sellers "e3b4998c7a498169dc7bce44e6bb6277" and "80ceebb4ee9b31afb6c6a916a574a1e2" had highest order values (single order each)  

Let’s look at the statistics and distribution of the metric.

In [None]:
df_sellers['avg_order_total_price'].explore.info(
    labels=dict(avg_order_total_price='Average Amount of Products in Order, R$')
    , title='Distribution of Average Amount of Products in Order Per Seller'
    , upper_quantile=0.95
    , hist_mode='dual_hist_trim'
)

**Key Observations:**  

- 75% of sellers average ≤189 R$ per order  
- Top 5% average ≥641 R$  

### 4.4.7 Product Price per Order

Let’s identify the top sellers.

In [None]:
seller_top('avg_product_price')

**Key Observations:**  

- Seller "e3b4998c7a498169dc7bce44e6bb6277" has highest average product price (single order)  


Let’s look at the statistics and distribution of the metric.

In [None]:
df_sellers['avg_product_price'].explore.info(
    labels=dict(avg_product_price='Average Product Price in Order, R$')
    , title='Distribution of Average Product Price in Order Per Seller'
    , upper_quantile=0.95
    , hist_mode='dual_hist_trim'
)

**Key Observations:**  

- 75% of sellers average ≤174 R$ per product  
- Top 5% average ≥595 R$  

### 4.4.8 Product Weight

Let’s identify the top sellers.

In [None]:
seller_top('avg_product_weight_kg')

**Key Observations:**  

- Maximum average product weight: 30kg  

Let’s look at the statistics and distribution of the metric.

In [None]:
df_sellers['avg_product_weight_kg'].explore.info(
    labels=dict(avg_product_weight_kg='Average Weight of Products, kg')
    , title='Distribution of Average Weight of Products Per Seller'
    , upper_quantile=0.95
    , hist_mode='dual_hist_trim'
)

**Key Observations:**  

- 75% of sellers average ≤2.7kg  
- Top 5% average ≥11kg  

### 4.4.9 Carrier Handoff Delay

Let’s identify the top sellers.

In [None]:
seller_top('avg_carrier_delivery_delay_days')

**Key Observations:**  

- Seller "586a871d4f1221763fddb6ceefdeb95e" had maximum carrier handoff delay: 45 days  


Let’s look at the statistics and distribution of the metric.

In [None]:
df_sellers['avg_carrier_delivery_delay_days'].explore.info(
    labels=dict(avg_carrier_delivery_delay_days='Average Carrier Delivery Delay Time, days')
    , title='Distribution of Average Carrier Delivery Delay Time Per Seller'
    , lower_quantile=0.05
    , upper_quantile=0.95
    , hist_mode='dual_hist_trim'
)

**Key Observations:**  

- Top 5% of sellers delivered to carrier ≥6.5 days early  
- 75% delivered ≥2 days early  
- Bottom 5% delayed ≥1 day  

<h2 id="4-5"> 4.5 Sales Analysis</h2>

### 4.5.1 Number of Sales

In [None]:
# Configure the shared `pb` plotting helper for the "Number of Sales" analysis:
# the metric is the number of distinct order_id values (agg_func='nunique') in df_sales,
# with order_purchase_dt as the time column.
# NOTE(review): norm_by='all' together with the '.1%' bar text and '.0%' x-axis tick formats
# suggests values are rendered as shares of the total — confirm against pb.configure.
pb.configure(
    df = df_sales
    , time_column = 'order_purchase_dt'
    , time_column_label = 'Date' 
    , metric = 'order_id'
    , metric_label = 'Share of Sales'
    , metric_label_for_distribution = 'Number of Sales'
    , agg_func = 'nunique'
    , norm_by='all'
    , axis_sort_order='descending'
    , text_auto='.1%'
    , update_fig={'xaxis': {'tickformat': '.0%'}}
)

In [None]:
print(f'Total number of sales: {df_sales.order_id.nunique()}')

Let’s look at the statistics and distribution of the metric.

In [None]:
pb.metric_info(freq='D')

**Key Observations:**  

- 75% of days had ≤215 orders  
- 5% had ≤45 orders  
- 5% had ≥293 orders  
- Several days exceeded 400 orders  

Let’s look by different dimensions.

**By Season**

Since 2018 has incomplete monthly data, it’s better to also analyze by year.

In [None]:
pb.bar_groupby(
    x='purchase_season'
    , color='purchase_year'
)

**Key Observations:**  

- Lowest sales in summer (both years)  
- Highest sales in autumn (2018)  

**By Time of Day**

In [None]:
pb.bar_groupby(y='purchase_time_of_day')

**Key Observations:**  

- Sales by time of day:  
  - Evening: 36% (peak)  
  - Night: 9% (lowest)  
  - Morning: 23%  
  - Afternoon: 32%  

**By Day of Week**

In [None]:
pb.bar_groupby(y='purchase_weekday')

**Key Observations:**  

- Saturday: 11% (lowest)  
- Monday: 16% (highest)  

**By Weekday vs Weekend**

In [None]:
pb.bar_groupby(y='purchase_day_type')

**Key Observations:**  

- 77% of orders were placed on weekdays  

**By Day of the Week and Hour of the Day**

In [None]:
# Heatmap of the currently configured metric with purchase hour on x and weekday on y.
# update_layout sets a tick step of 1 on the x-axis and clears its tick format
# (presumably to override the '.0%' format set in pb.configure — confirm).
fig = pb.heatmap(
    x='purchase_hour'
    , y='purchase_weekday'
    , labels={'color': 'Number of Sales'}
    , title='Number of Sales by Day of the Week and Hour of the Day'
).update_layout(xaxis_dtick=1, xaxis_tickformat=None)
# Register the figure with pb for the slide output, then render it in the notebook.
pb.to_slide(fig)
fig.show()

**Key Observations:**  

- 1AM-8AM had lowest sales across all weekdays  

**By Review Score**

In [None]:
pb.bar_groupby(y='order_avg_reviews_score')

**Key Observations:**  

- Review score distribution:  
  - 5 stars: 59%  
  - 2 stars: 3% (lowest)  
  - More 1-star than 2/3-star orders  

**By Whether the Order is Delayed or Not**

In [None]:
pb.bar_groupby(y='is_delayed')

**Key Observations:**  

- 92% of orders had no delivery delay  

**By Payment Category**

In [None]:
pb.bar_groupby(y='order_total_payment_cat')

**Key Observations:**  

- 63% of orders were medium-priced  

**By Order Weight Category**

In [None]:
pb.bar_groupby(y='order_total_weight_cat')

**Key Observations:**  

- Order weight distribution:  
  - Medium: 46%  
  - Light: 40%  

**By Delivery Time Category**

In [None]:
pb.bar_groupby(y='delivery_time_days_cat')

**Key Observations:**  

- 59% of orders had medium delivery time  

**By Presence of Installment Payments**

In [None]:
pb.bar_groupby(y='order_has_installment')

**Key Observations:**  

- 51% of orders used installments  

**By Payment Type**

In [None]:
pb.bar_groupby(y='order_payment_types')

**Key Observations:**  

- Payment methods:  
  - Credit card: 75%  
  - Boleto: 20%  

**By Product Category**

In [None]:
pb.bar_groupby(
    y='order_product_categories'
    , text_auto=False
)

**Key Observations:**  

- Top 3 product categories:  
  1. Bed Bath Table: 9%  
  2. Health Beauty: 9%  
  3. Sports Leisure: 8%  

**By Generalized Product Category**

In [None]:
pb.bar_groupby(
    y='order_general_product_categories'
    , text_auto=False
)

**Key Observations:**  

- Top 3 generalized categories:  
  1. Electronics: 27%  
  2. Furniture: 18%  
  3. Home & Garden: 14%  

**By Top Customer States**

In [None]:
pb.bar_groupby(y='customer_state', text_auto=False)

**Key Observations:**  

- Sales by state:  
  - São Paulo: 42%  
  - Rio de Janeiro: 13%  
  - Minas Gerais: 12%  
  - Others: ≤6%  

**By Top Customer Cities**

In [None]:
pb.bar_groupby(y='customer_city', text_auto=False)

**Key Observations:**  

- Sales by city:  
  - São Paulo: 16%  
  - Rio de Janeiro: 7%  
  - Others: ≤3%  

### 4.5.2 Sum of Sales

In [None]:
pb.configure(
    df = df_sales
    , time_column = 'order_purchase_dt'
    , metric = 'total_payment'
    , metric_label = 'Sum of Sales, R$'
    , agg_func = 'sum'
    , title_base = 'Sum and Number of Sales'
    , axis_sort_order='descending'
    , text_auto='.3s'
    , update_fig={'xaxis2': {'title_text': 'Number of Sales'}}
)

In [None]:
print(f'Total Sales Amount: {df_sales.total_payment.sum():,.2f}, R$')

Let’s look at the statistics and distribution of the metric.

In [None]:
pb.metric_info(freq='D')

**Key Observations:**  

- 75% of days had sales ≤33K R$  
- 5% of days had ≤6.7K R$  
- 5% of days had ≥49K R$  
- Several days exceeded 70K R$  

Let’s look by different dimensions.

**By Season**

Since 2018 has incomplete monthly data, it’s better to also analyze by year.

In [None]:
pb.bar_groupby(
    x='purchase_season'
    , color='purchase_year'
    , title='Sum of Sales by Season and Year'
)

**Key Observations:**  

- Lowest sales revenue in summer (both years)  
- Highest revenue in autumn (2018)  

**By Time of Day**

In [None]:
pb.bar_groupby(y='purchase_time_of_day', show_count=True, to_slide=True)

**Key Observations:**  

- Highest sales volume and revenue in evenings  
- Lowest at night  

**By Day of Week**

In [None]:
pb.bar_groupby(y='purchase_weekday', show_count=True, to_slide=True)

**Key Observations:**  

- Monday has highest sales volume/revenue  
- Saturday has lowest 

**By Weekday vs Weekend**

In [None]:
pb.bar_groupby(y='purchase_day_type', show_count=True, to_slide=True)

**Key Observations:**  

- Weekday sales/revenue significantly higher than weekends  


**By Day of the Week and Hour of the Day**

In [None]:
fig = pb.heatmap(
    x='purchase_hour'
    , y='purchase_weekday'
    , text_auto='.3s'
    , labels={'color': 'Amount, R$'}
    , title='Sales Amount by Day of the Week and Hour of the Day'
).update_layout(xaxis_dtick=1)
pb.to_slide(fig)
fig.show()

**Key Observations:**  

- 1AM-8AM has lowest revenue across all weekdays  

**By Whether the Order is Delayed or Not**

In [None]:
pb.bar_groupby(y='is_delayed', show_count=True)

**Key Observations:**  

- Non-delayed orders have significantly higher sales/revenue  

**By Order Weight Category**

In [None]:
pb.bar_groupby(y='order_total_weight_cat', show_count=True, to_slide=True)

**Key Observations:**  

- Medium-weight orders generate more revenue than heavy/light  
- Light orders have higher quantity share but lower revenue share  
- Heavy orders are more expensive  

**By Presence of Installment Payments**

In [None]:
pb.bar_groupby(y='order_has_installment', show_count=True, to_slide=True)

**Key Observations:**  

- Installment orders generate significantly more revenue despite similar order counts  
- Installment enables more expensive purchases  

**By Top Customer States**

In [None]:
pb.bar_groupby(y='customer_state', show_count=True, to_slide=True)

**Key Observations:**  

- São Paulo state dominates sales volume/revenue  
- Rio de Janeiro and Minas Gerais rank 2nd/3rd  

**By Top Customer Cities**

In [None]:
pb.bar_groupby(y='customer_city', show_count=True, to_slide=True)

**Key Observations:**  

- São Paulo city leads in sales volume/revenue  
- Rio de Janeiro ranks second  

**By Review Score**

In [None]:
pb.bar_groupby(y='order_avg_reviews_score', show_count=True, to_slide=True)

**Key Observations:**  

- 5-star reviews have highest sales/revenue  
- 2-star reviews have lowest  
- 1-star reviews exceed 2/3-star in volume/revenue  

**By Payment Type**

Since a single order can have multiple payments, we will measure transaction volume based on payment count.

In [None]:
pb.bar_groupby(
    y='order_payment_types'
    , show_count=True
    , to_slide=True
)

**Key Observations:**  

- Credit card leads payment methods (volume/revenue)  
- Boleto ranks second  

**By Product Category**

For the product category split, we cannot use the order payment amount. Instead, we will calculate the sum based on the product price and freight value.

The count will be determined by the number of items.

In [None]:
pb.bar_groupby(
    y='order_general_product_categories'
    , show_count=True
    , to_slide=True
)

**Key Observations:**  

- Electronics leads categories (volume/revenue)  
- Furniture ranks second  
- Furniture has a smaller gap between its quantity share and its revenue share  

### 4.5.3 Average Order Value

In [None]:
pb.configure(
    df = df_sales
    , metric = 'total_payment'
    , metric_label = 'Average Order Value, R$'
    , metric_label_for_distribution = 'Order Value, R$'
    , agg_func = 'mean'
    , title_base = 'Average Order Value and Number of Sales'
    , axis_sort_order='descending'
    , text_auto='.3s'
    , update_fig={'xaxis2': {'title_text': 'Number of Sales'}}    
)

Top Orders

In [None]:
pb.metric_top()

Let’s look at the statistics and distribution of the metric.

In [None]:
pb.metric_info(
    labels=dict(total_payment='Order Value, R$')
    , title='Distribution of Order Value'
    , upper_quantile=0.95
    , hist_mode='dual_hist_trim'
)

**Key Observations:**  

- 75% of orders ≤177 R$  
- 5% ≤33 R$  
- 5% ≥445 R$  
- Many outliers >1000 R$  

Let’s look by different dimensions.

**By Season**

Since 2018 has incomplete monthly data, it’s better to also analyze by year.

In [None]:
pb.bar_groupby(
    x='purchase_season'
    , color='purchase_year'
    , title='Average Order Value by Season and Year'
)

**Key Observations:**  

- Summer 2017 had higher order values  
- Other seasons slightly higher in 2018  

**By Day of the Week and Hour of the Day**

In [None]:
fig = pb.heatmap(
    x='purchase_hour'
    , y='purchase_weekday'
    , text_auto='.1f'
    , labels={'color': 'AOV, R$'}
    , title='Average Order Value by Day of the Week and Hour of the Day'
).update_layout(xaxis_dtick=1)
pb.to_slide(fig)
fig.show()

**Key Observations:**  

- Nighttime doesn't always have lowest average order value  
- Some weekday nights show value peaks  

**By Whether the Order is Delayed or Not**

In [None]:
pb.histogram(
    color='is_delayed'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=True
    , show_kde=False
    , nbins=30
).show()
pb.bar_groupby(
    y='is_delayed'
    , show_count=True
).show()

**Key Observations:**  

- Non-delayed orders have lower average values  

**By Order Weight Category**

In [None]:
pb.histogram(
    color='order_total_weight_cat'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=True
    , show_kde=False
    , nbins=30
).show()
pb.bar_groupby(
    y='order_total_weight_cat'
    , show_count=True
).show()

**Key Observations:**  

- Heavier orders have higher average values  

**By Presence of Installment Payments**

In [None]:
pb.histogram(
    color='order_has_installment'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=True
    , show_kde=False
    , nbins=30
).show()
pb.bar_groupby(
    y='order_has_installment'
    , show_count=True
    , to_slide=True
).show()

**Key Observations:**  

- Installment orders have much higher average values  

**By Top Customer States**

In [None]:
pb.box(
    y='customer_state'
    , upper_quantile=0.95
    , show_dual=True
).show()
pb.bar_groupby(
    y='customer_state'
    , show_count=True
    , to_slide=True
).show()

**Key Observations:**  

- São Paulo has most orders but lowest average value among top states  
- Para has highest average order value  

**By Top Customer Cities**

In [None]:
pb.box(
    y='customer_city'
    , upper_quantile=0.95
    , show_dual=True
).show()
pb.bar_groupby(
    y='customer_city'
    , show_count=True
    , to_slide=True
).show()

**Key Observations:**  

- Rio de Janeiro combines high volume with high average value  
- Salvador has highest average order value among top cities  

**By Review Score**

In [None]:
pb.histogram(
    color='order_avg_reviews_score'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=False
    , show_kde=True
    , nbins=30
).show()
pb.bar_groupby(
    y='order_avg_reviews_score'
    , show_count=True
    , to_slide=True
).show()

**Key Observations:**  

- 1-star reviews have highest order values  
- 2-star reviews rank second  
- Expensive orders receive more low ratings  

### 4.5.4 Reviews Score

In [None]:
pb.configure(
    df = df_sales
    , metric = 'order_avg_reviews_score'
    , metric_label = 'Average Order Reviews Score'
    , metric_label_for_distribution = 'Order Reviews Score'
    , agg_func = 'mean'
    , title_base = 'Average Order Reviews Score and Number of Sales'
    , axis_sort_order='descending'
    , text_auto='.3s'
    , update_fig={'xaxis2': {'title_text': 'Number of Sales'}}        
)

Let’s look at the statistics and distribution of the metric.

In [None]:
pb.metric_info(
    labels=dict(order_avg_reviews_score='Order Reviews Score')
    , title='Distribution of Order Reviews Score'
    , xaxis_type='category'
)

**Key Observations:**  

- 59% of orders have 5-star reviews  

Let’s look by different dimensions.

**By Season**

In [None]:
pb.bar_groupby(
    x='purchase_season'
    , color='purchase_year'
    , title='Average Order Reviews Score by Season and Year'
)

**Key Observations:**  

- Winter 2018 had slightly higher ratings  
- Other seasons slightly higher in 2017 

**By Day of the Week and Hour of the Day**

In [None]:
pb.heatmap(
    x='purchase_hour'
    , y='purchase_weekday'
    , text_auto='.1f'
    , title='Average Order Reviews Score by Day of the Week and Hour of the Day'
    , labels=dict(color = 'Score')
).update_layout(xaxis_dtick=1)

**Key Observations:**  

- Nighttime shows rating extremes (especially Thursdays)  

**By Delivery Delay Status**

In [None]:
pb.cat_compare(
    cat2='is_delayed'
    , visible_graphs=[2]
)
pb.bar_groupby(
    y='is_delayed'
    , show_count=True
    , to_slide=True
).show()

**Key Observations:**  

- Non-delayed orders have significantly higher ratings  
- Higher 5-star share for on-time deliveries  
- "Unknown" delivery status orders mostly get 1-star  

**By Delivery Time Category**

In [None]:
pb.cat_compare(
    cat2='delivery_time_days_cat'
    , visible_graphs=[2]
)
pb.bar_groupby(
    y='delivery_time_days_cat'
    , show_count=True
).show()

**Key Observations:**  

- Faster deliveries get better ratings  

**By Customer State**

In [None]:
pb.cat_compare(
    cat2='customer_state'
    , visible_graphs=[2]
    , trim_top_n_cat2=7
)
fig = pb.bar_groupby(
    y='customer_state'
    , show_count=True
).update_layout(xaxis_domain=[0, 0.4], xaxis2_domain=[0.6, 1])
pb.to_slide(fig)
fig.show()

**Key Observations:**  

- Maranhão has lowest average rating among top states  
- Rio de Janeiro and Bahia have highest 1-star share 

**By Customer City**

In [None]:
pb.cat_compare(
    cat2='customer_city'
    , visible_graphs=[2]
    , trim_top_n_cat2=7
)
pb.bar_groupby(
    y='customer_city'
    , show_count=True
).update_layout(xaxis_domain=[0, 0.4], xaxis2_domain=[0.6, 1]).show()

**Key Observations:**  

- Rio de Janeiro and Porto Alegre have notable 1-star concentrations  


### 4.5.5 Order Weight

In [None]:
pb.configure(
    df = df_sales
    , metric = 'total_weight_kg'
    , metric_label = 'Average Weight of Order, kg'
    , metric_label_for_distribution = 'Weight of Order, kg'
    , title_base = 'Average Weight of Order and Number of Sales'
    , agg_func = 'mean'
    , axis_sort_order='descending'
    , text_auto='.3s'
    , update_fig={'xaxis2': {'title_text': 'Number of Sales'}}        
)

Top Orders

In [None]:
pb.metric_top()

Let’s look at the statistics and distribution of the metric.

In [None]:
pb.metric_info(
    labels=dict(total_weight_kg='Weight of Order, kg')
    , title='Distribution of Weight of Order'
    , upper_quantile=0.99
    , hist_mode='dual_hist_trim'
)

**Key Observations:**  

- 75% of orders ≤2kg  
- 5% ≤150g  
- 5% ≥10kg  

Let’s look by different dimensions.

**By Season**

Since 2018 has incomplete monthly data, it’s better to also analyze by year.

In [None]:
pb.bar_groupby(
    x='purchase_season'
    , color='purchase_year'
    , title='Average Weight of Order by Season and Year'
)

**Key Observations:**  

- 2017 had heavier orders across all seasons  

**By Time of Day**

In [None]:
pb.histogram(
    color='purchase_time_of_day'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=False
    , show_kde=True
).show()
pb.bar_groupby(
    y='purchase_time_of_day'
    , show_count=True
).show()

**Key Observations:**  

- Afternoons have heaviest orders  
- Nights have lightest  

**By Whether the Order is Delayed or Not**

In [None]:
pb.histogram(
    color='is_delayed'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=False
    , show_kde=True
).show()
pb.bar_groupby(
    y='is_delayed'
    , show_count=True
    , to_slide=True
).show()

**Key Observations:**  

- Delayed orders are heavier  

**By Presence of Installment Payments**

In [None]:
pb.histogram(
    color='order_has_installment'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=False
    , show_kde=True
).show()
pb.bar_groupby(
    y='order_has_installment'
    , show_count=True
    , to_slide=True
)

**Key Observations:**  

- Installment orders are heavier  

**By Top Customer States**

In [None]:
pb.box(
    y='customer_state'
    , upper_quantile=0.95
    , show_dual=True
).show()
pb.bar_groupby(
    y='customer_state'
    , show_count=True
    , to_slide=True
).show()

**Key Observations:**  

- Mato Grosso has heaviest average orders among top states  

**By Top Customer Cities**

In [None]:
pb.box(
    y='customer_city'
    , upper_quantile=0.95
    , show_dual=True
).show()
pb.bar_groupby(
    y='customer_city'
    , show_count=True
    , to_slide=True
)

**Key Observations:**  

- Santos and Rio de Janeiro have heaviest average orders  

**By Review Score**

In [None]:
pb.bar_groupby(y='order_avg_reviews_score', show_count=True, to_slide=True)

**Key Observations:**  

- 1-star reviews have significantly heavier orders  
- 2-star reviews rank second  
- Heavy orders receive lower ratings  

### 4.5.6 Number of Products per Order

In [None]:
# Configure the shared `pb` plotting helper for the "Number of Products per Order" analysis:
# the metric is the mean of products_cnt over df_sales.
# title_base now includes "Average", matching metric_label and the sibling configure cells
# (e.g. 'Average Weight of Order and Number of Sales').
pb.configure(
    df = df_sales
    , metric = 'products_cnt'
    , metric_label = 'Average Number of Products in Order'
    , metric_label_for_distribution = 'Number of Products in Order'
    , title_base = 'Average Number of Products in Order and Number of Sales'
    , agg_func = 'mean'
    , axis_sort_order='descending'
    , text_auto='.3s'
    , update_fig={'xaxis2': {'title_text': 'Number of Sales'}}
)

Top Orders

In [None]:
pb.metric_top()

Let’s look at the statistics and distribution of the metric.

In [None]:
pb.metric_info(
    labels=dict(products_cnt='Number of Products in Order')
    , title='Distribution of Number of Products in Order'
)

**Key Observations:**  

- 90% of orders contain single product  
- Two anomalies had 20-21 products  

Let’s look by different dimensions.

**By Whether the Order is Delayed or Not**

In [None]:
pb.bar_groupby(
    y='is_delayed'
    , show_count=True
).show()

**Key Observations:**  

- Non-delayed orders have slightly more products  

**By Review Score**

In [None]:
pb.bar_groupby(y='order_avg_reviews_score', show_count=True, to_slide=True)

**Key Observations:**  

- 1/2-star reviews have more products per order  

### 4.5.7 Product Price per Order

In [None]:
pb.configure(
    df = df_sales
    , metric = 'avg_products_price'
    , metric_label = 'Average Product Price in Order, R$'
    , metric_label_for_distribution = 'Product Price in Order, R$'
    , agg_func = 'mean'
    , title_base = 'Average Product Price in Order and Number of Sales'
    , axis_sort_order='descending'
    , text_auto='.3s'
    , update_fig={'xaxis2': {'title_text': 'Number of Sales'}}         
)

Top Orders

In [None]:
pb.metric_top()

Let’s look at the statistics and distribution of the metric.

In [None]:
pb.metric_info(
    labels=dict(avg_products_price='Average Product Price in Order, R$')
    , title='Distribution of Average Product Price in Order'
    , upper_quantile=0.99
    , hist_mode='dual_hist_trim'
)

**Key Observations:**  

- 75% of orders have average product price ≤140 R$  
- 5% have ≥363 R$  

Let’s look by different dimensions.

**By Season**

Since 2018 has incomplete monthly data, it’s better to also analyze by year.

In [None]:
pb.bar_groupby(
    x='purchase_season'
    , color='purchase_year'
    , title='Average Product Price in Order by Season and Year'
)

**Key Observations:**  

- Summer/fall 2017 had higher product prices  
- Winter 2018 was higher  

**By Time of Day**

In [None]:
pb.histogram(
    color='purchase_time_of_day'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=False
    , show_kde=True
).show()
pb.bar_groupby(
    y='purchase_time_of_day'
    , show_count=True
).show()

**Key Observations:**  

- Nighttime has lower product prices  

**By Whether the Order is Delayed or Not**

In [None]:
pb.histogram(
    color='is_delayed'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=False
    , show_kde=True
).show()
pb.bar_groupby(
    y='is_delayed'
    , show_count=True
).show()

**Key Observations:**  

- Delayed orders have higher product prices  

**By Presence of Installment Payments**

In [None]:
pb.histogram(
    color='order_has_installment'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=False
    , show_kde=True
).show()
pb.bar_groupby(
    y='order_has_installment'
    , show_count=True
    , to_slide=True
).show()

**Key Observations:**  

- Installment orders have significantly higher product prices  

**By Top Customer States**

In [None]:
pb.box(
    y='customer_state'
    , upper_quantile=0.95
    , show_dual=True
).show()
pb.bar_groupby(
    y='customer_state'
    , show_count=True
).show()

**Key Observations:**  

- Para has highest average product price among top states  
- São Paulo has lowest  

**By Top Customer Cities**

In [None]:
pb.box(
    y='customer_city'
    , upper_quantile=0.95
    , show_dual=True
).show()
pb.bar_groupby(
    y='customer_city'
    , show_count=True
).show()

**Key Observations:**  

- Brasília, Rio de Janeiro and Salvador have highest product prices among top cities.  

### 4.5.8 Number of Sellers per Order

In [None]:
pb.configure(
    df = df_sales
    , metric = 'sellers_cnt'
    , metric_label = 'Average Number of Sellers in Order'
    , agg_func = 'mean'
    , axis_sort_order='descending'
)

Top Orders

In [None]:
pb.metric_top()

Let’s look at the statistics and distribution of the metric.

In [None]:
pb.metric_info(
    labels=dict(sellers_cnt='Number of Sellers in Order')
    , title='Distribution of Number of Sellers in Order'
    , xaxis_type='category'
)

**Key Observations:**  

- 99% of orders have single seller  

### 4.5.9 Number of Categories per Order

In [None]:
pb.configure(
    df = df_sales
    , metric = 'product_categories_cnt'
    , metric_label = 'Average Number of Categories in Order'
    , agg_func = 'mean'
    , axis_sort_order='descending'
)

Top Orders

In [None]:
pb.metric_top()

Let’s look at the statistics and distribution of the metric.

In [None]:
pb.metric_info(
    labels=dict(product_categories_cnt='Number of Categories in Order')
    , title='Distribution of Number of Categories in Order'
    , xaxis_type='category'
)

**Key Observations:**  

- 99% of orders have single category  

<h2 id="4-6"> 4.6 Product Analysis</h2>

### 4.6.1 Number of Products

In [None]:
pb.configure(
    df = df_products
    , metric = 'product_sales_cnt'
    , metric_label = 'Share of Sold Products'
    , agg_func = 'sum'
    , norm_by='all'
    , axis_sort_order='descending'
    , text_auto='.1%'
    , update_fig={'xaxis': {'tickformat': '.0%'}}
)

In [None]:
print(f'Total sold products count: {df_products.product_sales_cnt.sum():,.0f}')

Let's look at the top values of the metrics

In [None]:
pb.metric_top(id_column='product_id')

Let’s look at the statistics and distribution of the metric.

In [None]:
pb.metric_info(
    labels=dict(product_sales_cnt='Number of Units Sold per Product')
    , title='Distribution of Number of Units Sold per Product'
    , upper_quantile=0.99
    , hist_mode='dual_hist_trim'
)

**Key Observations:**  

- 75% of products sold 1-2 units total  
- Top 5% sold ≥10 units  

Let's look at the statistics and distribution of the number of sold products per day.

In [None]:
# Number of distinct products sold on each calendar day.
# The left join keeps every row of df_sales and attaches its order items;
# rows are then bucketed by day of order_purchase_dt and the distinct
# product_id values in each bucket are counted.
tmp_df_res = (
    df_sales.merge(df_items, on='order_id', how='left')
    .groupby(pd.Grouper(key='order_purchase_dt', freq='D'), observed=False)['product_id']
    .nunique()
    .to_frame('products_cnt_per_day')
)

In [None]:
# Distribution of the daily distinct-product counts.
# The labels key is the plotted column name (as in the other explore.info
# cells); the previous key 'orders_cnt_per_day' did not match any column.
tmp_df_res['products_cnt_per_day'].explore.info(
    labels=dict(products_cnt_per_day='Number of Sold Products per Day')
    , title='Distribution of Number of Sold Products per Day'
)

**Key Observations:**  

- 75% of days sold ≤207 products  
- Top 5% sold ≥277 products  
- Several days exceeded 400 products  

Let’s look by different dimensions.

**By Product Category**

In [None]:
fig = pb.bar_groupby(
    y='product_category'
    , trim_top_n_y=20
    , width=1100
    , height=500   
    , show_top_and_bottom_n = 15
    , show_count=False
).update_layout(xaxis_domain=[0, 0.4], xaxis2_domain=[0.6, 1], xaxis2_tickformat='.2%')
pb.to_slide(fig)
fig.show()

**Key Observations:**  

- Best-selling categories: Bed Bath Table, Health Beauty  
- Lowest-selling: Security and Services  

**By Generalized Product Category**

In [None]:
pb.bar_groupby(y='general_product_category', to_slide=True)

**Key Observations:**  

- Top 3 generalized categories by units sold:  
  1. Electronics (27%)  
  2. Furniture (19%)  
  3. Home & Garden (15%)  
- Lowest: Food & Drinks (1%)  

### 4.6.2 Product Price

In [None]:
pb.configure(
    df = df_products
    , metric = 'avg_price'
    , metric_label = 'Average Product Price, R$'
    , metric_label_for_distribution = 'Product Price, R$'
    , agg_func = 'mean'
    , axis_sort_order='descending'
    , text_auto='.3s'
)

Top products.

In [None]:
pb.metric_top(id_column='product_id')

Let’s look at the statistics and distribution of the metric.

In [None]:
# Distribution of the configured metric ('avg_price'), trimmed at the 99th
# percentile. The labels key now names that metric column; it previously
# used 'product_sales_cnt', left over from section 4.6.1.
pb.metric_info(
    labels=dict(avg_price='Average Product Price, R$')
    , title='Distribution of Average Product Price'
    , upper_quantile=0.99
    , hist_mode='dual_hist_trim'
)

**Key Observations:**  

- 75% of products had average price ≤153 R$  
- Bottom 5% ≤17 R$  
- Top 5% ≥470 R$  

Let’s look at the statistics and distribution of the metric per day.

In [None]:
# Mean item price for each calendar day.
# The left join keeps every row of df_sales and attaches its order items;
# rows are then bucketed by day of order_purchase_dt and the 'price'
# column is averaged within each bucket.
tmp_df_res = (
    df_sales.merge(df_items, on='order_id', how='left')
    .groupby(pd.Grouper(key='order_purchase_dt', freq='D'), observed=False)['price']
    .mean()
    .to_frame('avg_price_per_day')
)

In [None]:
tmp_df_res['avg_price_per_day'].explore.info(
    labels=dict(avg_price_per_day='Average Product Price per Day, R$')
    , title='Distribution of Average Product Price per Day, R$'
)

**Key Observations:**  

- Daily average product prices:  
  - Bottom 5% ≤94 R$  
  - Middle 50% 108-130 R$  
  - Top 5% ≥162 R$  

Let’s look by different dimensions.

**By Product Category**

In [None]:
print('Top Best')
pb.box(y='product_category').show()
print('Top Worst')
pb.box(
    y='product_category'
    , trim_top_n_direction='bottom'
).show()
pb.bar_groupby(
    y='product_category'
    , show_top_and_bottom_n=15
    , to_slide=True
).show()

**Key Observations:**  

- Highest priced category: Watches Gifts  
- Lowest priced: Flowers  

**By Generalized Product Category**

In [None]:
pb.box(y='general_product_category').show()
fig = pb.bar_groupby(
    y='general_product_category'
    , show_count=True
).update_layout(xaxis2_title_text='Number of Sold Products')
pb.to_slide(fig)
fig.show()

**Key Observations:**  

- Top 3 categories by average price:  
  1. Industry & Construction  
  2. Electronics  
  3. Fashion  
- Lowest: Food & Drinks 

### 4.6.3 Sales Amount of Products

In [None]:
pb.configure(
    df = df_products
    , metric = 'total_sales_amount'
    , metric_label = 'Total Sales Amount of Products, R$'
    , metric_label_for_distribution = 'Total Sales Amount per Product, R$'
    , agg_func = 'sum'
    , axis_sort_order='descending'
    , text_auto='.3s'
)

Top products.

In [None]:
pb.metric_top(id_column='product_id')

Let’s look at the statistics and distribution of the metric per day.

In [None]:
# Sum of item prices for each calendar day.
# The left join keeps every row of df_sales and attaches its order items;
# rows are then bucketed by day of order_purchase_dt and the 'price'
# column is summed within each bucket.
tmp_df_res = (
    df_sales.merge(df_items, on='order_id', how='left')
    .groupby(pd.Grouper(key='order_purchase_dt', freq='D'), observed=False)['price']
    .sum()
    .to_frame('total_price_per_day')
)

In [None]:
# Distribution of the daily summed item prices.
# The labels key is the plotted column name; the previous key
# 'avg_price_per_day' was copied from section 4.6.2 and matched no column here.
tmp_df_res['total_price_per_day'].explore.info(
    labels=dict(total_price_per_day='Total Product Price per Day, R$')
    , title='Distribution of Total Product Price per Day, R$'
)

**Key Observations:**  

- 75% of days had product revenue ≤29K R$  
- Top 5% ≥42K R$  

Let’s look by different dimensions.

**By Product Category**

In [None]:
print('Top Best')
pb.box(y='product_category').show()
print('Top Worst')
pb.box(
    y='product_category'
    , trim_top_n_direction='bottom'
).show()
pb.bar_groupby(
    y='product_category'
    , show_top_and_bottom_n=15
    , horizontal_spacing=0.25
    , to_slide=True
).show()

**Key Observations:**  

- Highest revenue categories: Health beauty, Watches gifts  
- Lowest: Security and services  

**By Generalized Product Category**

In [None]:
pb.box(y='general_product_category').show()
fig = (
    pb.bar_groupby(y='general_product_category', show_count=True)
    .update_layout(
        xaxis2_title_text='Number of Sold Products'
    )
)
pb.to_slide(fig)
fig.show()

**Key Observations:**  

- Top 3 categories by revenue:  
  1. Electronics  
  2. Furniture  
  3. Home & Garden  
- Lowest: Food & Drinks  

### 4.6.4 Sales Amount per Product

In [None]:
# Configure the plot builder for section 4.6.4: the same column as 4.6.3
# ('total_sales_amount') but aggregated with the mean instead of the sum.
# The axis label previously read "per Products" (typo).
pb.configure(
    df = df_products
    , metric = 'total_sales_amount'
    , metric_label = 'Average Sales Amount per Product, R$'
    , metric_label_for_distribution = 'Total Sales Amount per Product, R$'
    , agg_func = 'mean'
    , axis_sort_order='descending'
    , text_auto='.3s'    
)

Let’s look at the statistics and distribution of the metric.

In [None]:
pb.metric_info(
    labels=dict(total_sales_amount='Total Sales Amount per Product, R$')
    , title='Distribution of Total Sales Amount per Product'
    , upper_quantile=0.99
    , hist_mode='dual_hist_trim'
)

**Key Observations:**  

- 75% of products generated ≤325 R$ lifetime revenue  

Let’s look by different dimensions.

**By Product Category**

In [None]:
print('Top Best')
pb.box(y='product_category').show()
print('Top Worst')
pb.box(
    y='product_category'
    , trim_top_n_direction='bottom'
).show()
pb.bar_groupby(
    y='product_category'
    , show_top_and_bottom_n=15
    , horizontal_spacing=0.25
    , to_slide=True
)

**Key Observations:**  

- Highest average revenue per product: Watches gifts  
- Lowest: Flowers  

**By Generalized Product Category**

In [None]:
pb.box(y='general_product_category').show()
pb.bar_groupby(y='general_product_category', to_slide=True).show()

**Key Observations:**  

- Top 3 categories by average revenue:  
  1. Electronics  
  2. Beauty & Health  
  3. Industry & Construction  
- Lowest: Books & Stationery 

### 4.6.5 Price Range

In [None]:
pb.configure(
    df = df_products
    , metric = 'price_range'
    , metric_label = 'Average Price Range per Product, R$'
    , metric_label_for_distribution = 'Price Range per Product, R$'
    , agg_func = 'mean'
    , axis_sort_order='descending'
    , text_auto='.2f'
)

Top products.

In [None]:
pb.metric_top(id_column='product_id')

Let’s look at the statistics and distribution of the metric.

In [None]:
pb.metric_info(
    upper_quantile=0.95
    , hist_mode='dual_hist_trim'
)

**Key Observations:**  

- 80% of products maintained stable prices  
- 5% had price changes ≥20 R$  

Let’s look by different dimensions.

**By Product Category**

In [None]:
print('Top Best')
pb.box(y='product_category').show()
print('Top Worst')
pb.box(
    y='product_category'
    , trim_top_n_direction='bottom'
).show()
pb.bar_groupby(
    y='product_category'
    , show_top_and_bottom_n=15
    , horizontal_spacing=0.25
    , to_slide=True
).show()

**Key Observations:**  

- Most price volatility: Watches gifts  

**By Generalized Product Category**

In [None]:
pb.box(y='general_product_category').show()
pb.bar_groupby(y='general_product_category', to_slide=True)

**Key Observations:**  

- Top 3 categories by price changes:  
  1. Electronics  
  2. Industry & Construction  
  3. Beauty & Health  
- Lowest volatility: Food & Drinks  

### 4.6.6 Quantity of Product per Order

In [None]:
pb.configure(
    df = df_products
    , metric = 'avg_product_qty_per_order'
    , metric_label = 'Average Quantity of Product per Order'
    , agg_func = 'mean'
    , axis_sort_order='descending'
    , text_auto='.3s'
)

Top products.

In [None]:
pb.metric_top(id_column='product_id')

Let’s look at the statistics and distribution of the metric.

In [None]:
pb.metric_info(
    upper_quantile=0.95
    , hist_mode='dual_hist_trim'
)

**Key Observations:**  

- 85% of products appeared as single units in orders  

Let’s look by different dimensions.

**By Generalized Product Category**

In [None]:
pb.box(y='general_product_category').show()
pb.bar_groupby(y='general_product_category').show()

**Key Observations:**  

- Highest average quantity per order: Food & Drinks  

### 4.6.7 Length of Product Name

In [None]:
pb.configure(
    df = df_products
    , metric = 'product_name_lenght'
    , metric_label = 'Length of Product Name'
)

Top products.

In [None]:
pb.metric_top(id_column='product_id')

Let’s look at the statistics and distribution of the metric.

In [None]:
pb.metric_info()

**Key Observations:**  

- 75% of products have names ≤57 characters  

### 4.6.8 Length of Product Description

In [None]:
pb.configure(
    df = df_products
    , metric = 'product_description_lenght'
    , metric_label = 'Length of Product Description'
)

Top products.

In [None]:
pb.metric_top(id_column='product_id')

Let’s look at the statistics and distribution of the metric.

In [None]:
pb.metric_info()

**Key Observations:**  

- 75% of products have descriptions ≤1000 characters  

### 4.6.9 Number of Product Photos

In [None]:
pb.configure(
    df = df_products
    , metric = 'product_photos_qty'
    , metric_label = 'Number of Product Photos'
)

Top products.

In [None]:
pb.metric_top(id_column='product_id')

Let’s look at the statistics and distribution of the metric.

In [None]:
pb.metric_info()

**Key Observations:**  

- 52% of products have 1 photo  
- Top 5% have ≥6 photos  

### 4.6.10 Product Weight

In [None]:
pb.configure(
    df = df_products
    , metric = 'product_weight_g'
    , metric_label = 'Product Weight, g'
)

Top products.

In [None]:
pb.metric_top(id_column='product_id')

Let’s look at the statistics and distribution of the metric.

In [None]:
pb.metric_info(
    upper_quantile=0.95
    , hist_mode='dual_hist_trim'    
)

**Key Observations:**  

- 75% of products weigh ≤1.9kg  
- Top 5% weigh ≥11kg  

### 4.6.11 Product Length

In [None]:
pb.configure(
    df = df_products
    , metric = 'product_length_cm'
    , metric_label = 'Product Length, cm'
)

Top products.

In [None]:
pb.metric_top(id_column='product_id')

Let’s look at the statistics and distribution of the metric.

In [None]:
pb.metric_info()

**Key Observations:**  

- 75% of products are ≤38cm long  
- Top 5% ≥65cm  

### 4.6.12 Product Width

In [None]:
pb.configure(
    df = df_products
    , metric = 'product_width_cm'
    , metric_label = 'Product Width, cm'
)

Top products.

In [None]:
pb.metric_top(id_column='product_id')

Let’s look at the statistics and distribution of the metric.

In [None]:
pb.metric_info()

**Key Observations:**  

- 75% of products are ≤30cm wide  
- Top 5% ≥47cm  

### 4.6.13 Product Height

In [None]:
pb.configure(
    df = df_products
    , metric = 'product_height_cm'
    , metric_label = 'Product Height, cm'
)

Top products.

In [None]:
pb.metric_top(id_column='product_id')

Let’s look at the statistics and distribution of the metric.

In [None]:
pb.metric_info()

**Key Observations:**  

- 75% of products are ≤21cm tall  
- Top 5% ≥44cm  

### 4.6.14 Product Volume

In [None]:
pb.configure(
    df = df_products
    , metric = 'product_volume_cm3'
    , metric_label = 'Product Volume, cm3'
)

Top products.

In [None]:
pb.metric_top(id_column='product_id')

Let’s look at the statistics and distribution of the metric.

In [None]:
pb.metric_info()

**Key Observations:**  

- 75% of products have volume ≤19K cm3
- Top 5% ≥64K cm3

### 4.6.15 Weight to Volume Ratio

In [None]:
pb.configure(
    df = df_products
    , metric = 'weight_to_volume_ratio'
    , metric_label = 'Product Weight to Volume Ratio'
)

Top products.

In [None]:
pb.metric_top(id_column='product_id')

Let’s look at the statistics and distribution of the metric.

In [None]:
pb.metric_info(
    upper_quantile=0.99
    , hist_mode='dual_hist_trim'    
)

**Key Observations:**  

- 75% of products have weight/volume ratio ≤0.2  
- Top 5% ≥0.5  

### 4.6.16 Other metrics

**What fraction of products were not sold at all?**

We have missing values in the number of sold units for products that were never sold.

In [None]:
# Share of products whose sold-units value is missing (mean of the boolean
# isna() mask).
# NOTE(review): the other cells in section 4.6 use 'product_sales_cnt' as the
# units-sold column of df_products; confirm that 'total_units_sold' exists
# and is the intended column.
products_no_sales_share = (df_products.total_units_sold.isna()).mean()

In [None]:
print(f'Share of Products with No Sales: {products_no_sales_share:.1%}')

<h2 id="4-7"> 4.7 Review Analysis</h2>

### 4.7.1 Number of Reviews

In [None]:
pb.configure(
    df = df_reviews
    , time_column = 'review_creation_dt'
    , metric = 'review_id'
    , metric_label = 'Share of Reviews'
    , metric_label_for_distribution = 'Number of Reviews'
    , agg_func = 'nunique'
    , norm_by='all'
    , axis_sort_order='descending'
    , text_auto='.1%'
    , update_fig={'xaxis': {'tickformat': '.0%'}}    
)

In [None]:
print(f'Total number of reviews: {df_reviews.review_id.nunique():,}')

Let’s look at the statistics and distribution of the metric.

In [None]:
pb.metric_info(freq='D')

**Key Observations:**  

- Typical day: 1 review created  
- 75% of days had ≤270 reviews  
- Top 5% ≥375 reviews  

Let’s look by different dimensions.

**By Day of Week**

In [None]:
pb.bar_groupby(y='review_creation_weekday', to_slide=True)

**Key Observations:**  

- Fewest reviews on Mondays  
- Sundays slightly more than Mondays but still low  
- Possible review registration pattern  

**By Day Type**

In [None]:
pb.bar_groupby(y='review_day_type')

**Key Observations:**  

- 76% of reviews created on weekdays  
- Consistent with weekends having fewer days than weekdays  

**By Review Score**

In [None]:
pb.bar_groupby(y='review_score', to_slide=True)

**Key Observations:**  

- Review score distribution:  
  - 5 stars: 58%  
  - 4 stars: 19%  
  - 1 star: 12%  
  - 3 stars: 8%  
  - 2 stars: 3% 

### 4.7.2 Review score

In [None]:
pb.configure(
    df = df_reviews
    , time_column = 'review_creation_dt'
    , metric = 'review_score'
    , metric_label = 'Average Review score'
    , agg_func = 'mean'
    , axis_sort_order='descending'
    , text_auto='.3s'
)

In [None]:
print(f'Average Review score: {df_reviews.review_score.mean():.2f}')

Let’s look at the statistics and distribution of the metric.

In [None]:
pb.metric_info(
    labels=dict(review_score='Review score')
    , title='Distribution of Review score'
    , xaxis_type='category'
)

**Key Observations:**  

- 58% of reviews had score 5.

Let’s look at the statistics and distribution of the metric per day.

In [None]:
pb.metric_info(freq='D')

**Key Observations:**  

- Daily average ratings:  
  - Bottom 5% <3.26  
  - Middle 50% 3.9-4.3  
  - Top 5% >4.6  

Let’s look by different dimensions.

**By Day Type**

In [None]:
# Compare the review-score metric across day types, then show the grouped bar chart.
# NOTE(review): the return value of cat_compare is discarded and, unlike the
# bar_groupby call below, .show() is not called on it. Confirm that
# cat_compare renders its own output; otherwise nothing from it is displayed.
pb.cat_compare(cat2='review_day_type'
            , visible_graphs=[2]
)
pb.bar_groupby(y='review_day_type').show()

**Key Observations:**  

- Weekdays have slightly higher ratings  
- More 5-star reviews weekdays  
- More 1-star reviews weekends  

**By Day of Week**

In [None]:
# Compare the review-score metric across weekdays, then show the grouped bar chart.
# NOTE(review): the return value of cat_compare is discarded and, unlike the
# bar_groupby call below, .show() is not called on it. Confirm that
# cat_compare renders its own output; otherwise nothing from it is displayed.
pb.cat_compare(cat2='review_creation_weekday'
            , visible_graphs=[2]
)
pb.bar_groupby(y='review_creation_weekday').show()

**Key Observations:**  

- Sundays have lowest ratings  
- Highest 1-star share on Sundays  
- Lowest 5-star share on Sundays  

### 4.7.3 Review Answer Time

In [None]:
pb.configure(
    df = df_reviews
    , time_column = 'review_creation_dt'
    , metric = 'review_answer_time_days'
    , metric_label = 'Average Review Answer Time, days'
    , metric_label_for_distribution = 'Review Answer Time, days'
    , agg_func = 'mean'
    , axis_sort_order='descending'
    , text_auto='.3s'
)

In [None]:
print(f'Average Review Answer Time: {df_reviews.review_answer_time_days.mean():.2f} days')

Let’s look at the statistics and distribution of the metric.

In [None]:
pb.metric_info(
    upper_quantile=0.95
    , hist_mode='dual_hist_trim'    
)

**Key Observations:**  

- Review response time bimodal: ~1 day and ~3.5 days  
- 75% responded within 3.1 days  
- Top 5% took ≥7 days  

Let’s look at the statistics and distribution of the metric per day.

In [None]:
pb.metric_info(freq='D')

**Key Observations:**  

- 5% of review days had average response time ≥5.85 days  


Let’s look by different dimensions.

**By Day of Week**

In [None]:
pb.histogram(color='review_creation_weekday').show()
pb.bar_groupby(y='review_creation_weekday').show()

**Key Observations:**  

- Slowest responses to Friday reviews  
- Fastest responses to Monday reviews  

### 4.7.4 Comment Message Length

In [None]:
# Configure the plot builder for section 4.7.4: median length of the review
# comment message. The displayed labels previously misspelled "Length" as
# "Lenght"; the column name 'review_comment_message_len' is unchanged.
pb.configure(
    df=df_reviews
    , time_column='review_creation_dt'
    , metric='review_comment_message_len'
    , metric_label='Median Review Comment Message Length'
    , metric_label_for_distribution='Review Comment Message Length'
    , agg_func='median'
    , text_auto='.3s'
)

In [None]:
# Print the overall median comment length (spelling of "length" corrected in the output text).
print(f'Median Review comment message length: {df_reviews.review_comment_message_len.median():.2f}')

Let’s look at the statistics and distribution of the metric.

In [None]:
pb.metric_info()

**Key Observations:**  

- 75% of reviews have messages ≤100 characters  

Let’s look by different dimensions.

**By Review Score**

In [None]:
pb.bar_groupby(y='review_score', to_slide=True)

**Key Observations:**  

- Lower ratings correlate with longer messages  
- Negative reviews tend to be more detailed  

### 4.7.5 NPS

For calculating NPS, we will divide customers into the following groups:

- Promoters: customers who gave a rating of 5
- Passive: customers who gave a rating of 4
- Detractors: customers who gave a rating of 1-3

Let's calculate NPS for each day.

In [None]:
# Per-day table of unique review counts, one column per review score.
# fill_value=0 turns "no reviews with this score on this day" into 0 rather
# than NaN; without it, any day missing a 1-, 2-, 3- or 5-star review gets
# NaN detractors/promoters and therefore a NaN NPS.
tmp_df_res = (
    df_reviews.pivot_table(
        index=pd.Grouper(key='review_creation_dt', freq='D')
        , columns='review_score'
        , values='review_id'
        , aggfunc='nunique'
        , fill_value=0
    )
)
# Row total is taken first, while the frame still holds only the score columns.
tmp_df_res['total_responses'] = tmp_df_res.sum(axis=1)
# Promoters: score 5. Detractors: scores 1-3. Score 4 (passive) only counts
# towards the total.
tmp_df_res['promoters'] = tmp_df_res[5]
tmp_df_res['detractors'] = tmp_df_res[1] + tmp_df_res[2] + tmp_df_res[3]
# NPS = (promoters - detractors) as a percentage of all responses that day.
tmp_df_res['nps'] = (tmp_df_res['promoters'] - tmp_df_res['detractors']) * 100 / tmp_df_res['total_responses']
tmp_df_res.reset_index(inplace=True)

Let’s look at the statistics and distribution of the metric per day.

In [None]:
tmp_df_res['nps'].explore.info(
    labels=dict(nps='NPS per Day')
    , title='Distribution of NPS per Day'
)

**Key Observations:**  

- Only ~5% of days had good NPS (>50)  
- 5% had negative NPS  
- Indicates customer dissatisfaction spikes  

### 4.7.6 Comment Title

Let's look at the word cloud from review titles.

In [None]:
df_reviews.viz.wordcloud('review_comment_title')

**Key Observations:**  

- Most review titles use positive language  

Let's look at the top words by frequency.

In [None]:
fig = df_reviews.analysis.word_frequency(
    'review_comment_title'
    , text_auto=True
    , title='Top 10 Most Frequent Words in Review Title'
)
pb.to_slide(fig)
fig.show()

**Key Observations:**  

- Most common title words: "recomend", "excellent"  

Let’s analyze the sentiment of the text.

In [None]:
df_reviews.analysis.sentiment('review_comment_title')

**Key Observations:**  

- ~10% of titles are negative  
- Sentiment IQR above 0 (neutral/positive bias)  

### 4.7.7 Comment Message

Let's look at the word cloud and the top words by frequency from the review messages.

In [None]:
df_reviews.viz.wordcloud('review_comment_message')
fig = df_reviews.analysis.word_frequency(
    'review_comment_message'
    , text_auto=True
    , title='Top 10 Most Frequent Words in Review Message'
)
pb.to_slide(fig)
fig.show()

**Key Observations:**  

- Many words relate to delivery  
- Most common review word: "product"  

Let’s analyze the sentiment of the text.

In [None]:
df_reviews.analysis.sentiment('review_comment_message')

**Key Observations:**  

- ~15% of messages are negative  
- Overall sentiment leans positive  

### 4.7.8 Impact of Rating on Review Text

**Score 1**

Let's look at the word cloud, top 20 words by frequency, and the emotional tone of the text for a rating of 1.

In [None]:
df_reviews[lambda x: x.review_score==1].viz.wordcloud('review_comment_message')
df_reviews[lambda x: x.review_score==1].analysis.word_frequency('review_comment_message').show()
df_reviews[lambda x: x.review_score==1].analysis.sentiment('review_comment_message')

**Key Observations:**  

- 1-star reviews:  
  - Contain negative words  
  - Clearly negative sentiment (IQR <0)  

---

**Score 2**

Let's look at the word cloud, top 20 words by frequency, and the emotional tone of the text for a rating of 2.

In [None]:
df_reviews[lambda x: x.review_score==2].viz.wordcloud('review_comment_message')
df_reviews[lambda x: x.review_score==2].analysis.word_frequency('review_comment_message').show()
df_reviews[lambda x: x.review_score==2].analysis.sentiment('review_comment_message')

**Key Observations:**  

- 2-star reviews:  
  - Contain negative words  
  - Mostly negative sentiment  

---

**Score 3**

Let's look at the word cloud, top 20 words by frequency, and the emotional tone of the text for a rating of 3.

In [None]:
df_reviews[lambda x: x.review_score==3].viz.wordcloud('review_comment_message')
df_reviews[lambda x: x.review_score==3].analysis.word_frequency('review_comment_message').show()
df_reviews[lambda x: x.review_score==3].analysis.sentiment('review_comment_message')

**Key Observations:**  

- 3-star reviews:  
  - Fewer negative words  
  - Leans positive overall  

---

**Score 4**

Let's look at the word cloud, top 20 words by frequency, and the emotional tone of the text for a rating of 4.

In [None]:
df_reviews[lambda x: x.review_score==4].viz.wordcloud('review_comment_message')
df_reviews[lambda x: x.review_score==4].analysis.word_frequency('review_comment_message').show()
df_reviews[lambda x: x.review_score==4].analysis.sentiment('review_comment_message')

**Key Observations:**  

- 4-star reviews:  
  - Many positive words  
  - Clearly positive sentiment  

---

**Score 5**

Let's look at the word cloud, top 20 words by frequency, and the emotional tone of the text for a rating of 5.

In [None]:
df_reviews[lambda x: x.review_score==5].viz.wordcloud('review_comment_message')
df_reviews[lambda x: x.review_score==5].analysis.word_frequency('review_comment_message').show()
df_reviews[lambda x: x.review_score==5].analysis.sentiment('review_comment_message')

**Key Observations:**  

- 5-star reviews:  
  - Dominated by positive words  
  - Strongly positive sentiment  

<h2 id="4-8"> 4.8 Delivery Analysis</h2>

### 4.8.1 Delivery Cost

In [None]:
pb.configure(
    df = df_sales
    , time_column = 'order_purchase_dt'
    , metric = 'total_freight_value'
    , metric_label = 'Average Freight Value per Order, R$'
    , metric_label_for_distribution = 'Freight Value per Order, R$'
    , agg_func = 'mean'
    , axis_sort_order='descending'
    , text_auto='.3s'
)

In [None]:
print(f'Average Freight Value per Order: {df_sales.total_freight_value.mean():.2f} R$')

Top Orders.

In [None]:
pb.metric_top()

Let’s look at the statistics and distribution of the metric.

In [None]:
pb.metric_info(
    upper_quantile=0.95
    , hist_mode='dual_hist_trim'    
)

**Key Observations:**  

- 75% of orders have shipping costs ≤24 R$  
- Top 5% have shipping costs ≥54.7 R$  
- Several extreme outliers exist with very high shipping costs 

In [None]:
pb.metric_top(freq='D')

Let’s look by different dimensions.

**By Whether the Order is Delayed or Not**

In [None]:
pb.histogram(
    color='is_delayed'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=False
    , show_kde=True
    , nbins=30
).show()
pb.bar_groupby(
    y='is_delayed'
    , show_count=True
).show()

**Key Observations:**  

- Delayed orders have higher shipping costs than non-delayed  

**By Order Weight Category**

In [None]:
pb.histogram(
    color='order_total_weight_cat'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=True
    , show_kde=False
    , nbins=30
).show()
pb.bar_groupby(
    y='order_total_weight_cat'
).show()

**Key Observations:**  

- Heavier orders have higher shipping costs (expected pattern)  

**By Presence of Installment Payments**

In [None]:
pb.histogram(
    color='order_has_installment'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=True
    , show_kde=False
    , nbins=30
).show()
pb.bar_groupby(
    y='order_has_installment'
    , show_count=True
    , to_slide=True
).show()

**Key Observations:**  

- Installment orders have higher shipping costs  

**By Top Customer States**

In [None]:
pb.box(
    y='customer_state'
    , upper_quantile=0.95
    , show_dual=True
).show()
fig = pb.bar_groupby(
    y='customer_state'
    , show_count=True
).update_layout(xaxis2_title_text='Number of Sales')
pb.to_slide(fig)
fig.show()

**Key Observations:**  

- Among top states by sales volume:  
  - São Paulo has lowest average shipping costs  
  - Maranhão has highest  

**By Top Customer Cities**

In [None]:
pb.box(
    y='customer_city'
    , upper_quantile=0.95
    , show_dual=True
).show()
pb.bar_groupby(
    y='customer_city'
    , show_count=True
).update_layout(xaxis2_title_text='Number of Sales')

**Key Observations:**  

- Among top cities by sales volume, highest average shipping costs in:  
  1. Salvador  
  2. Porto Alegre  
  3. Brasília  

**By Review Score**

In [None]:
pb.histogram(
    color='order_avg_reviews_score'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=False
    , show_kde=True
    , nbins=30
).show()
fig = pb.bar_groupby(
    y='order_avg_reviews_score'
    , show_count=True
).update_layout(xaxis2_title_text='Number of Sales')
pb.to_slide(fig)
fig.show()

**Key Observations:**  

- Higher shipping costs correlate with lower order ratings  


### 4.8.2 Distance Between Customer and Seller

In [None]:
pb.configure(
    df = df_sales
    , time_column = 'order_purchase_dt'
    , metric = 'avg_distance_km'
    , metric_label = 'Average Distance, km'
    , metric_label_for_distribution = 'Average Distance, km'
    , agg_func = 'mean'
    , axis_sort_order='descending'
    , text_auto='.1f'
)

In [None]:
print(f'Average Distance: {df_sales.avg_distance_km.mean():.2f} km')

Let’s look at the statistics and distribution of the metric.

In [None]:
pb.metric_info(
    upper_quantile=0.95
    , hist_mode='dual_hist_trim'    
)

**Key Observations:**  

- 75% of orders have seller-buyer distance ≤800km  
- 5% ≤16.5km  
- 5% ≥2,000km  
- Several extreme outliers (>4,000km)  

Let’s look by different dimensions.

**By Whether the Order is Delayed or Not**

In [None]:
pb.histogram(
    color='is_delayed'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=False
    , show_kde=True
    , nbins=30
).show()
pb.bar_groupby(
    y='is_delayed'
    , to_slide=True
).show()

**Key Observations:**  

- Delayed orders have greater average seller-buyer distance  


**By Presence of Installment Payments**

In [None]:
pb.histogram(
    color='order_has_installment'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=True
    , show_kde=False
    , nbins=30
).show()
pb.bar_groupby(
    y='order_has_installment'
    , show_count=True
    , to_slide=True
)

**Key Observations:**  

- Installment orders have greater average seller-buyer distance  


### 4.8.3 Delivery Time

#### 4.8.3.1 Proportion of Each Stage in Delivery Time

Let's look at what percentage of the total delivery time each stage occupies. 

We will not consider any anomalous dates, as there are only a few and they will not significantly affect the result.

In [None]:
# Keep only the four delivery-milestone timestamps and retain orders whose
# milestones are in chronological order (purchase <= approved <= carrier <= customer).
# Rows with missing timestamps are removed by the trailing dropna().
tmp_df_sales = (
    df_sales[[
        'order_purchase_dt',
        'order_approved_dt',
        'order_delivered_carrier_dt',
        'order_delivered_customer_dt',
    ]]
    .loc[
        lambda d: (
            (d['order_approved_dt'] >= d['order_purchase_dt'])
            & (d['order_delivered_carrier_dt'] >= d['order_approved_dt'])
            & (d['order_delivered_customer_dt'] >= d['order_delivered_carrier_dt'])
            & (d['order_delivered_customer_dt'] >= d['order_purchase_dt'])
        )
    ]
    .dropna()
)

In [None]:
# Express each delivery stage as a percentage of the total purchase-to-customer
# duration. The total is computed first (in seconds); the three stage columns
# are evaluated afterwards and reference it.
tmp_df_sales = tmp_df_sales.assign(
    from_purchase_to_customer=lambda d: (
        d['order_delivered_customer_dt'] - d['order_purchase_dt']
    ).dt.total_seconds(),
    **{
        'From Purchase to Approved': lambda d: (
            (d['order_approved_dt'] - d['order_purchase_dt']).dt.total_seconds()
            * 100 / d['from_purchase_to_customer']
        ).round(2),
        'From Approved to Carrier': lambda d: (
            (d['order_delivered_carrier_dt'] - d['order_approved_dt']).dt.total_seconds()
            * 100 / d['from_purchase_to_customer']
        ).round(2),
        'From Carrier to Customer': lambda d: (
            (d['order_delivered_customer_dt'] - d['order_delivered_carrier_dt']).dt.total_seconds()
            * 100 / d['from_purchase_to_customer']
        ).round(2),
    },
)

In [None]:
# Reshape to long format: one row per (order date, stage) with the stage's
# share of the total delivery time.
tmp_df_sales = (
    tmp_df_sales
    .rename(columns={'order_purchase_dt': 'Date'})
    .melt(
        id_vars='Date',
        value_vars=['From Purchase to Approved', 'From Approved to Carrier', 'From Carrier to Customer'],
        var_name='Stage',
        value_name='Percent of All Delivery Time',
    )
)

Let's look at what percentage of the total delivery time each stage occupies on average.

In [None]:
# Mean share of total delivery time per stage, largest first.
sorted_means = tmp_df_sales.groupby('Stage')['Percent of All Delivery Time'].mean().sort_values(ascending=False)

In [None]:
# Hand-positioned (x, y, caption) labels placed around the donut; the legend is
# hidden below, so these captions identify the slices.
annotations_data = [
    (0.6, -0.1, 'Carrier > Customer'),
    (-0.05, 0.8, 'Approved > Carrier'),
    (0.45, 1.08, 'Purchase > Approved')
]

# Donut chart of the average share of delivery time taken by each stage.
fig = (
    px.pie(
        names=sorted_means.index,
        values=sorted_means.values,
        hole=0.4,
        title='Average Delivery Time Distribution by Stage',
        labels={'names': 'Delivery Stage', 'values': 'Percentage of Total Time'},
        category_orders={'names': ['From Carrier to Customer', 'From Approved to Carrier']},
    )
    .update_traces(
        textinfo='percent',
        textposition='inside',
        texttemplate='%{percent:.1%}',
        hovertemplate='%{label}: %{percent:.1%}',
    )
    .update_layout(
        showlegend=False,
        width=500,
        height=400,
        margin={'t': 60},
        title_y=0.97,
    )
)

for x, y, text in annotations_data:
    fig.add_annotation(x=x, y=y, text=text, showarrow=False, font={'size': 12})

pb.to_slide(fig)
fig.show()

**Key Observations:**  

- Delivery time distribution:  
  - Payment approval: 4%  
  - Carrier handoff: 25.5%  
  - Carrier delivery: 70.5%  

Let’s look at the distribution of each stage’s share.

In [None]:
tmp_df_sales.viz.box(
    x='Percent of All Delivery Time'
    , y='Stage'
    , title='Percent of All Delivery Time by Stage'
)

**Key Observations:**  

- Carrier delivery consumes most of total delivery time  
- Significant differences between stages (non-overlapping IQRs)  


#### 4.8.3.2 Total Delivery Time

In [None]:
pb.configure(
    df = df_sales
    , metric = 'delivery_time_days'
    , metric_label = 'Average Order Delivery Time, days'
    , metric_label_for_distribution = 'Order Delivery Time, days'
    , agg_func = 'mean'
    , title_base = 'Average Order Delivery Time and Number of Sales'
    , axis_sort_order='descending'
    , text_auto='.3s'
    , update_fig={'xaxis2': {'title_text': 'Number of Sales'}}
)

Top Orders

In [None]:
pb.metric_top()

Let’s look at the statistics and distribution of the metric.

In [None]:
pb.metric_info(
    labels=dict(delivery_time_days='Order Delivery Time, days')
    , title='Distribution of Order Delivery Time'
    , upper_quantile=0.99
    , hist_mode='dual_hist_trim'
)

**Key Observations:**  

- Median delivery time: ~10 days  
- 75% of orders are delivered within ≤16 days  
- Top 5% take ≥30 days  

Let’s look by different dimensions.

**By Day of Week**

In [None]:
pb.histogram(
    color='purchase_weekday'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=False
    , show_kde=True
).show()
pb.bar_groupby(
    y='purchase_weekday'
    , show_count=True
).show()

**Key Observations:**  

- Friday/Saturday orders have slightly longer delivery times  

**By Payment Category**

In [None]:
pb.histogram(
    color='order_total_payment_cat'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=False
    , show_kde=True
).show()
pb.bar_groupby(
    y='order_total_payment_cat'
    , show_count=True
    , to_slide=True
).show()

**Key Observations:**  

- More expensive orders take longer to deliver  

**By Order Weight Category**

In [None]:
pb.histogram(
    color='order_total_weight_cat'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=False
    , show_kde=True
).show()
pb.bar_groupby(
    y='order_total_weight_cat'
    , show_count=True
    , to_slide=True
).show()

**Key Observations:**  

- Heavy orders take longer to deliver than light/medium  

**By Review Score**

In [None]:
pb.histogram(
    color='order_avg_reviews_score'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=False
    , show_kde=True
).show()
pb.bar_groupby(
    y='order_avg_reviews_score'
    , show_count=True
    , to_slide=True
).show()

**Key Observations:**  

- 1-star rated orders have noticeably longer delivery times  


**By Top Customer States**

In [None]:
pb.box(
    y='customer_state'
    , upper_quantile=0.95
    , show_dual=True
).show()
pb.bar_groupby(
    y='customer_state'
    , show_count=True
    , to_slide=True
).show()

**Key Observations:**  

- Among top states by sales volume, top 3 states with longest delivery times:  
  1. Pará  
  2. Maranhão  
  3. Ceará 

**By Top Customer Cities**

In [None]:
pb.box(
    y='customer_city'
    , upper_quantile=0.95
    , show_dual=True
).show()
pb.bar_groupby(
    y='customer_city'
    , show_count=True
    , to_slide=True
).show()

**Key Observations:**  

- Among top cities by sales volume, top 3 cities with longest delivery times:  
  1. Salvador  
  2. Porto Alegre  
  3. Rio de Janeiro   

#### 4.8.3.3 Delivery Delay

In [None]:
pb.configure(
    df = df_sales
    , metric = 'delivery_delay_days'
    , metric_label = 'Average Delivery Delay, days'
    , metric_label_for_distribution = 'Delivery Delay, days'
    , agg_func = 'mean'
    , title_base = 'Average Delivery Delay and Number of Sales'
    , axis_sort_order='descending'
    , text_auto='.3s'
    , update_fig={'xaxis2': {'title_text': 'Number of Sales'}}
)

Top Orders

In [None]:
pb.metric_top()

Let’s look at the statistics and distribution of the metric.

In [None]:
# Statistics and distribution for the currently configured metric
# (`delivery_delay_days`).
pb.metric_info(
    # The label key must name the configured metric column; it was previously
    # keyed by `delivery_time_days` (copied from the Total Delivery Time section),
    # so the "Delivery Delay, days" label was attached to the wrong column.
    labels=dict(delivery_delay_days='Delivery Delay, days')
    , title='Distribution of Delivery Delay'
    , lower_quantile=0.01
    , upper_quantile=0.99
    , hist_mode='dual_hist_trim'
)

**Key Observations:**  

- 75% of orders deliver ≥6 days early  
- ~5% are ≥4 days late 

**By Review Score**

In [None]:
pb.histogram(
    color='order_avg_reviews_score'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=False
    , show_kde=True
).show()
pb.bar_groupby(
    y='order_avg_reviews_score'
    , show_count=True
    , to_slide=True
).show()

**Key Observations:**  

- Higher rated orders deliver earlier than estimated  

In [None]:
pb.box(
    y='customer_state'
    , upper_quantile=0.95
    , show_dual=True
).show()
pb.bar_groupby(
    y='customer_state'
    , show_count=True
    , to_slide=True
).show()

**Key Observations:**  

- Among top states by sales volume, top 3 states for early delivery:  
  1. Mato Grosso  
  2. Pará  
  3. Rio Grande do Sul  

#### 4.8.3.4 From Purchase to Approved Time

In [None]:
pb.configure(
    df = df_sales
    , metric = 'from_purchase_to_approved_hours'
    , metric_label = 'Average Order Processing Time, hour'
    , metric_label_for_distribution = 'Order Processing Time, hour'
    , agg_func = 'mean'
    , title_base = 'Average Order Processing Time and Number of Sales'
    , axis_sort_order='descending'
    , text_auto='.3s'
    , update_fig={'xaxis2': {'title_text': 'Number of Sales'}}
)

Top Orders

In [None]:
pb.metric_top()

Let’s look at the statistics and distribution of the metric.

In [None]:
pb.metric_info(
    labels=dict(from_purchase_to_approved_hours='Order Processing Time, hour')
    , title='Distribution of Order Processing Time'
    , upper_quantile=0.99
    , hist_mode='dual_hist_trim'
)

**Key Observations:**  

- 75% of orders take ≥14 hours to process  
- Top 5% take ≥48 hours  

Let’s look by different dimensions.

**By Day of Week**

In [None]:
pb.histogram(
    color='purchase_weekday'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=False
    , show_kde=True
).show()
pb.bar_groupby(
    y='purchase_weekday'
    , show_count=True
    , to_slide=True
).show()

**Key Observations:**  

- Friday/Saturday orders process slowest  
- Wednesday orders process fastest  

**By Time of Day**

In [None]:
pb.histogram(
    color='purchase_time_of_day'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=False
    , show_kde=True
).show()
pb.bar_groupby(
    y='purchase_time_of_day'
    , show_count=True
    , to_slide=True
).show()

**Key Observations:**  

- Nighttime orders take longer to process  

**By Whether the Order is Delayed**

In [None]:
pb.histogram(
    color='is_delayed'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=False
    , show_kde=True
).show()
pb.bar_groupby(
    y='is_delayed'
    , show_count=True
).show()

**Key Observations:**  

- Non-delayed orders process faster (expected pattern)

**By Weekday vs Weekend**

In [None]:
pb.histogram(
    color='purchase_day_type'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=False
    , show_kde=True
).show()
pb.bar_groupby(
    y='purchase_day_type'
    , show_count=True
    , to_slide=True
).show()

**Key Observations:**  

- Weekday orders process significantly faster than weekends  

**By Payment Category**

In [None]:
pb.histogram(
    color='order_total_payment_cat'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=False
    , show_kde=True
).show()
pb.bar_groupby(
    y='order_total_payment_cat'
    , show_count=True
).show()

**Key Observations:**  

- Cheap/expensive orders process faster than mid-priced  

**By Order Weight Category**

In [None]:
pb.histogram(
    color='order_total_weight_cat'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=False
    , show_kde=True
).show()
pb.bar_groupby(
    y='order_total_weight_cat'
    , show_count=True
).show()

**Key Observations:**  

- Heavy orders take longer to process  

**By Presence of Installment Payments**

In [None]:
pb.histogram(
    color='order_has_installment'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=False
    , show_kde=True
).show()
pb.bar_groupby(
    y='order_has_installment'
    , show_count=True
    , to_slide=True
).show()

**Key Observations:**  

- Installment orders process much faster  

**By Review Score**

In [None]:
pb.histogram(
    color='order_avg_reviews_score'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=False
    , show_kde=True
).show()
pb.bar_groupby(
    y='order_avg_reviews_score'
    , show_count=True
).show()

**Key Observations:**  

- 1/2-star rated orders took longer to process  

#### 4.8.3.5 From Approval to Carrier Time

In [None]:
pb.configure(
    df = df_sales
    , metric = 'from_approved_to_carrier_days'
    , metric_label = 'Average Order Approval to Carrier Time, days'
    , metric_label_for_distribution = 'Order Approval to Carrier Time, days'
    , agg_func = 'mean'
    , title_base = 'Average Order Approval to Carrier Time and Number of Sales'
    , axis_sort_order='descending'
    , text_auto='.3s'
    , update_fig={'xaxis2': {'title_text': 'Number of Sales'}}
)

Top Orders

In [None]:
pb.metric_top()

Let’s look at the statistics and distribution of the metric.

In [None]:
pb.metric_info(
    labels=dict(from_approved_to_carrier_days='Order Approval to Carrier Time, days')
    , title='Distribution of Order Approval to Carrier Time'
    , upper_quantile=0.99
    , hist_mode='dual_hist_trim'
)

**Key Observations:**  

- 75% of orders transfer to carrier within ≤3.5 days  
- Top 5% take ≥8 days  

Let’s look by different dimensions.

**By Day of Week**

In [None]:
pb.histogram(
    color='purchase_weekday'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=False
    , show_kde=True
).show()
pb.bar_groupby(
    y='purchase_weekday'
    , show_count=True
    , to_slide=True
).show()

**Key Observations:**  

- Friday/Saturday orders take longest to transfer to carrier 

**By Time of Day**

In [None]:
pb.histogram(
    color='purchase_time_of_day'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=False
    , show_kde=True
).show()
pb.bar_groupby(
    y='purchase_time_of_day'
    , show_count=True
).show()

**Key Observations:**  

- Morning orders transfer fastest to carrier   

**By Payment Category**

In [None]:
pb.histogram(
    color='order_total_payment_cat'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=False
    , show_kde=True
).show()
pb.bar_groupby(
    y='order_total_payment_cat'
    , show_count=True
    , to_slide=True
).show()

**Key Observations:**  

- Expensive orders take longer to transfer to carrier  

**By Order Weight Category**

In [None]:
pb.histogram(
    color='order_total_weight_cat'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=False
    , show_kde=True
).show()
pb.bar_groupby(
    y='order_total_weight_cat'
    , show_count=True
    , to_slide=True
).show()

**Key Observations:**  

- Heavy orders take longer to transfer to carrier     

**By Presence of Installment Payments**

In [None]:
pb.histogram(
    color='order_has_installment'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=False
    , show_kde=True
).show()
pb.bar_groupby(
    y='order_has_installment'
    , show_count=True
).show()

**Key Observations:**  

- Installment orders take slightly longer to transfer to carrier 

**By Review Score**

In [None]:
pb.histogram(
    color='order_avg_reviews_score'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=False
    , show_kde=True
).show()
pb.bar_groupby(
    y='order_avg_reviews_score'
    , show_count=True
).show()

**Key Observations:**  

- Faster carrier transfer correlates with higher ratings  

#### 4.8.3.6 Carrier Delivery Time

In [None]:
pb.configure(
    df = df_sales
    , metric = 'from_carrier_to_customer_days'
    , metric_label = 'Average Delivery Time from Carrier, days'
    , metric_label_for_distribution = 'Delivery Time from Carrier, days'
    , agg_func = 'mean'
    , title_base = 'Average Delivery Time from Carrier and Number of Sales'
    , axis_sort_order='descending'
    , text_auto='.3s'
    , update_fig={'xaxis2': {'title_text': 'Number of Sales'}}
)

Top Orders

In [None]:
pb.metric_top()

Let’s look at the statistics and distribution of the metric.

In [None]:
pb.metric_info(
    labels=dict(from_carrier_to_customer_days='Delivery Time from Carrier, days')
    , title='Distribution of Delivery Time from Carrier'
    , upper_quantile=0.99
    , hist_mode='dual_hist_trim'
)

**Key Observations:**  

- Median carrier delivery time: ≥7 days  
- 25% take ≥12 days  
- 5% take ≥24 days  

Let’s look by different dimensions.

**By Payment Category**

In [None]:
pb.histogram(
    color='order_total_payment_cat'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=False
    , show_kde=True
).show()
pb.bar_groupby(
    y='order_total_payment_cat'
    , show_count=True
    , to_slide=True
).show()

**Key Observations:**  

- Cheap items deliver fastest via carrier  

**By Order Weight Category**

In [None]:
pb.histogram(
    color='order_total_weight_cat'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=False
    , show_kde=True
).show()
pb.bar_groupby(
    y='order_total_weight_cat'
    , show_count=True
).show()

**Key Observations:**  

- Light items deliver slightly faster via carrier   

**By Presence of Installment Payments**

In [None]:
pb.histogram(
    color='order_has_installment'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=False
    , show_kde=True
).show()
pb.bar_groupby(
    y='order_has_installment'
    , show_count=True
).show()

**Key Observations:**  

- Installment orders take slightly longer via carrier  

**By Review Score**

In [None]:
pb.histogram(
    color='order_avg_reviews_score'
    , upper_quantile=0.95
    , mode='dual_box_trim'
    , show_box=True
    , show_hist=False
    , show_kde=True
).show()
pb.bar_groupby(
    y='order_avg_reviews_score'
    , show_count=True
).show()

**Key Observations:**  

- Longer carrier delivery times correlate with lower ratings  

**By Top Customer States**

In [None]:
pb.box(
    y='customer_state'
    , upper_quantile=0.95
    , show_dual=True
).show()
pb.bar_groupby(
    y='customer_state'
    , show_count=True
    , to_slide=True
).show()

**Key Observations:**  

- Among top states by sales volume, top 3 states with longest carrier delivery:  
  1. Pará  
  2. Maranhão  
  3. Ceará  

**By Top Customer Cities**

In [None]:
pb.box(
    y='customer_city'
    , upper_quantile=0.95
    , show_dual=True
).show()
pb.bar_groupby(
    y='customer_city'
    , show_count=True
    , to_slide=True
).show()

**Key Observations:**  

- Among top cities by sales volume, top 3 cities with longest carrier delivery:  
  1. Salvador  
  2. Porto Alegre  
  3. Rio de Janeiro  

#### 4.8.3.7 Carrier Handoff Delay

In [None]:
pb.configure(
    df = df_sales
    , metric = 'avg_carrier_delivery_delay_days'
    , metric_label = 'Average Carrier Delivery Delay, days'
    , metric_label_for_distribution = 'Carrier Delivery Delay, days'
    , agg_func = 'mean'
    , title_base = 'Average Carrier Delivery Delay and Number of Sales'
    , axis_sort_order='descending'
    , text_auto='.3s'
    , update_fig={'xaxis2': {'title_text': 'Number of Sales'}}
)

Top Orders

In [None]:
pb.metric_top()

Let’s look at the statistics and distribution of the metric.

In [None]:
pb.metric_info(
    lower_quantile=0.01
    , upper_quantile=0.99
    , hist_mode='dual_hist_trim'    
)

**Key Observations:**  

- 75% of orders transfer to carrier ≥1.6 days early  
- Extreme early transfers due to data anomalies  
- 5% are ≥0.79 days late  
- 1% are ≥7 days late  

Let’s look by different dimensions.

**By Review Score**

In [None]:
pb.bar_groupby(
    y='order_avg_reviews_score'
    , show_count=True
).show()

**Key Observations:**  

- Earlier carrier transfer correlates with higher ratings  

<h2 id="4-9"> 4.9 Payment Analysis</h2>

### 4.9.1 Number of Payments

In [None]:
pb.configure(
    df = df_payments
    , metric = 'payment_sequential'
    , metric_label = 'Share of Payments'
    , metric_label_for_distribution = 'Number of Payments' 
    , agg_func = 'count'
    , norm_by = 'all'
    , axis_sort_order='descending'
    , text_auto='.1%'
    , update_fig={'xaxis': {'tickformat': '.0%'}}
)

In [None]:
print(f'Total number of payments in order: {df_payments.payment_sequential.count():,}')

Let's look at the statistics and distribution of the number of payments per day.

In [None]:
# Daily number of payment records. The left join keeps every order, and
# `count` ignores missing `payment_sequential` values, so orders without a
# matching payment contribute nothing to the day's total.
tmp_df_res = (
    df_orders
    .merge(df_payments, how='left', on='order_id')
    .groupby(pd.Grouper(freq='D', key='order_purchase_dt'), observed=False)
    .agg(payments_cnt_per_day=('payment_sequential', 'count'))
)

In [None]:
tmp_df_res['payments_cnt_per_day'].explore.info(
    labels=dict(payments_cnt_per_day='Number of Payments per Day')
    , title='Distribution of Number of Payments per Day'
)

**Key Observations:**  

- 75% of days have ≤230 payments  
- Top 5% have ≥312 payments  
- Several days exceeded 500 payments  

Let's look at top days.

In [None]:
tmp_df_res.sort_values('payments_cnt_per_day', ascending=False).head()

**Key Observations:**  

- Black Friday had anomalous payment volumes  

**By Payment Type**

In [None]:
pb.bar_groupby(y='payment_type', to_slide=True)

**Key Observations:**  

- Payment method distribution:  
  - Credit card: 74%  
  - Boleto: 19%  
  - Voucher: 5.5%  
  - Debit card: 1.5%  

### 4.9.2 Payment Value

In [None]:
pb.configure(
    df = df_payments
    , metric = 'payment_value'
    , metric_label = 'Average Payment Value, R$'
    , metric_label_for_distribution = 'Payment Value, R$'
    , agg_func = 'mean'
    , axis_sort_order='descending'
    , text_auto='.3s'
)

Let’s look at the statistics and distribution of the metric.

In [None]:
pb.metric_info(
    upper_quantile=0.95
    , hist_mode='dual_hist_trim'        
)

**Key Observations:**  

- 75% of payments are ≤172 R$  
- Top 5% are ≥440 R$  
- Several exceed 6,000 R$  

**By Payment Type**

In [None]:
pb.bar_groupby(y='payment_type', to_slide=True)

**Key Observations:**  

- Credit card payments have highest average value  
- Vouchers have lowest  

### 4.9.3 Number of Payment Installments

In [None]:
pb.configure(
    df = df_payments
    , metric = 'payment_installments'
    , metric_label = 'Average Payment Installments'
    , metric_label_for_distribution = 'Payment Installments'
    , agg_func = 'mean'
    , axis_sort_order='descending'
    , text_auto='.2s'
)

Let’s look at the statistics and distribution of the metric.

In [None]:
pb.metric_info()

**Key Observations:**  

- 51% of payments are single-installment  
- Some have ≥9 installments 

**By Payment Type**

In [None]:
pb.bar_groupby(y='payment_type', to_slide=True)

**Key Observations:**  

- Credit cards average 3.5 installments  
- Other methods have no installments  

<h2 id="4-10"> 4.10 Geographical Analysis</h2>

### 4.10.1 Geo-analysis by ZIP Codes

#### 4.10.1.1 Data Preparation

We will aggregate data by the first 3 digits of the customer’s ZIP code. Each point on the map will represent a unique truncated ZIP code. 

Without truncation, we would get excessive detail and no noticeable differences. 

We will prepare the data for visualization.

We have 6 orders that were made outside South America. 

After truncating the prefixes, we may get additional coordinates outside South America. 

We will remove them to avoid interfering with the map analysis.

In [None]:
# Work on a copy of the sales table with the two label columns recoded as
# booleans, so that `mean`/`sum` over them yield shares and counts.
tmp_df_res = df_sales.assign(
    is_delayed=lambda d: d['is_delayed'] == 'Delayed',
    order_has_installment=lambda d: d['order_has_installment'] == 'Has Installments',
)

In [None]:
# Attach the 3-digit ZIP prefix to every sale and aggregate all map metrics
# per prefix (one output row per prefix).
tmp_df_res = (
    tmp_df_res.merge(df_customers_origin[['customer_id', 'customer_zip_code_prefix_3_digits']], on='customer_id')
    .groupby('customer_zip_code_prefix_3_digits', as_index=False)
    .agg(
        total_orders=('order_id', 'nunique')
        # `is_delayed` was recoded to bool above, so its mean is the delayed share.
        , orders_delayed_share = ('is_delayed', 'mean')
        , total_payment = ('total_payment', 'sum')
        # NOTE(review): this is a row-level mean of `total_payment`; it equals the
        # average order value only if df_sales has one row per order — confirm.
        , aov = ('total_payment', 'mean')
        , avg_installments = ('total_installments_cnt', 'mean')
        # Counts below are later turned into rates by dividing by `total_orders`.
        , first_orders_cnt = ('sale_is_customer_first_purchase', 'sum')
        , installment_orders_cnt = ('order_has_installment', 'sum')
        , total_reviews = ('reviews_cnt', 'sum')
        , avg_review_score = ('order_avg_reviews_score', 'mean')
        , avg_delivery_delay_days = ('delivery_delay_days', 'mean')
        , avg_delivery_time_days = ('delivery_time_days', 'mean')
        , avg_products_cnt = ('products_cnt', 'mean')
        , avg_freight_ratio = ('freight_ratio', 'mean')
        , avg_order_weight_kg = ('total_weight_kg', 'mean')
        , avg_order_volume_cm3 = ('total_volume_cm3', 'mean')
    ) 
)

In [None]:
# Compute the cancel rate per 3-digit ZIP prefix from all orders, join it onto
# the per-prefix sales aggregates, then attach coordinates for prefixes located
# in South America.
tmp_df_res = (
    df_orders.assign(
        is_canceled = lambda x: x.order_status=='Canceled'
    )
    # NOTE(review): the other merges with df_customers_origin in this section
    # join on `customer_id`; here the key is `customer_unique_id`. If a unique
    # customer maps to several `customer_id` rows, this join would duplicate
    # that customer's orders and skew `cancel_rate` — confirm the key is intended.
    .merge(df_customers_origin, on='customer_unique_id')
    .groupby('customer_zip_code_prefix_3_digits', as_index=False)
    .agg(
        cancel_rate = ('is_canceled', 'mean')
    )
    # Right join: keep every prefix present in the sales aggregates, even when
    # no cancel rate was computed for it.
    .merge(tmp_df_res, on='customer_zip_code_prefix_3_digits', how='right')
    # Inner join: prefixes without a South American geolocation row are dropped.
    # NOTE(review): assumes df_geolocations has one row per 3-digit prefix — verify.
    .merge(df_geolocations[lambda x:x.in_south_america], left_on = 'customer_zip_code_prefix_3_digits', right_on='geolocation_zip_code_prefix_3_digits')
)

In [None]:
# Share of a prefix's orders that are not a customer's first purchase.
tmp_df_res['repeat_purchase_rate'] = (
    tmp_df_res['total_orders']
    .sub(tmp_df_res['first_orders_cnt'])
    .div(tmp_df_res['total_orders'])
)
# Share of a prefix's orders paid in installments.
tmp_df_res['installment_orders_rate'] = (
    tmp_df_res['installment_orders_cnt'].div(tmp_df_res['total_orders'])
)

We will calculate the average MAU by the truncated ZIP code.

In [None]:
# Average monthly active users (MAU) per 3-digit ZIP prefix:
# 1) count unique customers per (prefix, calendar month),
# 2) average those monthly counts per prefix.
temp_mau = (
    df_sales.merge(df_customers_origin[['customer_id', 'customer_zip_code_prefix_3_digits']], on='customer_id')
    # 'ME' groups by month end.
    .groupby(['customer_zip_code_prefix_3_digits', pd.Grouper(key='order_purchase_dt', freq='ME')], observed=False)
    .agg(mau = ('customer_unique_id', 'nunique'))
    .groupby('customer_zip_code_prefix_3_digits', observed=False)
    .mean()
)
# Left join keeps every prefix row already in tmp_df_res; `mau` becomes a new column.
tmp_df_res = tmp_df_res.merge(temp_mau, on='customer_zip_code_prefix_3_digits', how='left')

In [None]:
# The intermediate MAU table has been merged into tmp_df_res; drop the name.
del temp_mau

#### 4.10.1.2 Data Visualization

We will create labels for displaying on the maps.

In [None]:
# Human-readable labels for the map figures, keyed by column name.
# A label may carry a unit after a comma (e.g. "Sales Amount, R$").
labels_for_map = {
    'total_orders': 'Number of Sales',
    'total_payment': 'Sales Amount, R$',
    'aov': 'Average Order Value, R$',
    'mau': 'MAU',
    'avg_freight_ratio': 'Average Freight Ratio',
    'avg_delivery_time_days': 'Average Delivery Time, days',
    'avg_review_score': 'Average Review Score',
    'orders_delayed_share': 'Percentage of Delayed Orders',
    'avg_products_cnt': 'Average Number of Products in Order',
    'installment_orders_rate': 'Installment Payment Rate',
    'avg_installments': 'Average Number of Installments in Order',
    'avg_order_weight_kg': 'Average Order Weight, kg',
    'avg_order_volume_cm3': 'Average Order Volume, cm3',
    'repeat_purchase_rate': 'Repeat Purchase Rate',
    'cancel_rate': 'Cancel Rate',
    'geolocation_lat': 'Latitude',
    'geolocation_lng': 'Longitude',
    'customer_zip_code_prefix_3_digits': 'Zip Code Prefix',
}

We will create a function for visualization.

In [None]:
def plot_map_zip(metric: str):
    """Draw a Plotly scatter map of ``metric`` per 3-digit ZIP code prefix.

    Reads the module-level ``tmp_df_res`` frame and ``labels_for_map`` lookup.
    The label text before the first comma goes into the figure title; the text
    after it (if any) becomes the colorbar title.

    Args:
        metric: Column of ``tmp_df_res`` used to colour the points; it must
            also be a key of ``labels_for_map``.

    Returns:
        The created figure, after it has also been passed to ``pb.to_slide``.
    """
    metric_label = labels_for_map[metric]
    label_parts = metric_label.split(',')
    unit_suffix = label_parts[1] if ',' in metric_label else None

    # Share-type metrics are formatted as percentages in hover and colorbar.
    share_metrics = (
        'avg_freight_ratio', 'orders_delayed_share',
        'installment_orders_rate', 'repeat_purchase_rate',
        'cancel_rate',
    )
    shown_as_percent = metric in share_metrics

    # Coordinates are hidden from the tooltip; the ZIP prefix is always shown.
    tooltip_spec = {
        'geolocation_lat': False,
        'geolocation_lng': False,
        'customer_zip_code_prefix_3_digits': True,
    }
    if metric != 'total_orders':
        # Order volume accompanies every other metric in the tooltip.
        tooltip_spec['total_orders'] = ':.2s'
    tooltip_spec[metric] = ':.1%' if shown_as_percent else ':.2s'

    fig = px.scatter_map(
        tmp_df_res,
        lat='geolocation_lat',
        lon='geolocation_lng',
        color=metric,
        color_continuous_scale="matter",
        hover_data=tooltip_spec,
        labels=labels_for_map,
        title=f"Distribution of {label_parts[0]} by 3-Digit Zip Code Prefix",
        width=700,
        height=650,
        zoom=3,
        center=dict(lat=-14.235004, lon=-55.92528),
    )

    if shown_as_percent:
        fig.update_coloraxes(colorbar_tickformat=".0%")

    fig.update_layout(
        margin={'l': 10, 'r': 10, 'b': 10, 't': 30},
        title_y=0.99,
        coloraxis_colorbar_title_text=unit_suffix,
    )
    pb.to_slide(fig)
    return fig

**Where is the sales volume higher?**

In [None]:
plot_map_zip('total_orders')

**Where is the sales amount higher?**

In [None]:
plot_map_zip('total_payment')

**Where is the average order value higher?**

In [None]:
plot_map_zip('aov')

**Where is average MAU higher?**

In [None]:
plot_map_zip('mau')

**Where do customers pay more for delivery?**

In [None]:
plot_map_zip('avg_freight_ratio')

**How is delivery time distributed across regions?**

In [None]:
plot_map_zip('avg_delivery_time_days')

**What is the average rating by regions?**

In [None]:
plot_map_zip('avg_review_score')

**How are delayed orders distributed across regions?**

In [None]:
plot_map_zip('orders_delayed_share')

**How is the number of items per order distributed across regions?**

In [None]:
plot_map_zip('avg_products_cnt')

**Which regions have a higher proportion of installment payments?**

In [None]:
plot_map_zip('installment_orders_rate')

**What regions have more installments per order?**

In [None]:
plot_map_zip('avg_installments')

**What regions have the heaviest orders?**

In [None]:
plot_map_zip('avg_order_weight_kg')

**What regions have the largest volume orders?**

In [None]:
plot_map_zip('avg_order_volume_cm3')

**What regions have a higher repeat purchase rate?**

In [None]:
plot_map_zip('repeat_purchase_rate')

**What regions have a higher proportion of canceled orders?**

In [None]:
plot_map_zip('cancel_rate')

### 4.10.2 Geo-analysis by State

#### 4.10.2.1 Data Preparation

Creating dataframe for visualization

In [None]:
# Attach the customer's state and the `population` column to every sale
# (left join, so every row of df_sales is kept).
tmp_df_res = df_sales.merge(
    df_customers_origin[['customer_id', 'customer_state_short', 'population']],
    how='left',
    on='customer_id',
)
# Recode the two labelled flags as booleans.
for flag_column, positive_label in (
    ('is_delayed', 'Delayed'),
    ('order_has_installment', 'Has Installments'),
):
    tmp_df_res[flag_column] = tmp_df_res[flag_column] == positive_label

Calculate average MAU by state.

In [None]:
# Average monthly active users (MAU) per state: count unique customers in each
# (state, month) bucket, then average those monthly counts per state.
temp_mau = (
    # freq='ME' buckets order_purchase_dt by month end
    tmp_df_res.groupby(['customer_state_short', pd.Grouper(key='order_purchase_dt', freq='ME')], observed=False)
    .agg(mau = ('customer_unique_id', 'nunique'))
    .groupby('customer_state_short', observed=False)
    .mean()
)

In [None]:
# Collapse the sale-level frame to one row per state, then attach average MAU.
tmp_df_res = (
    tmp_df_res
    .groupby('customer_state_short', as_index=False, observed=False)
    .agg(**{
        # Volumes and money
        'total_orders': ('order_id', 'nunique'),
        'total_payment': ('total_payment', 'sum'),
        'aov': ('total_payment', 'mean'),
        # Counters used later as numerators of the rate metrics
        'first_orders_cnt': ('sale_is_customer_first_purchase', 'sum'),
        'installment_orders_cnt': ('order_has_installment', 'sum'),
        # Reviews and delivery
        'total_reviews': ('reviews_cnt', 'sum'),
        'avg_review_score': ('order_avg_reviews_score', 'mean'),
        'avg_delivery_time_days': ('delivery_time_days', 'mean'),
        'avg_delivery_delay_days': ('delivery_delay_days', 'mean'),
        # Order composition
        'avg_installments': ('total_installments_cnt', 'mean'),
        'avg_products_cnt': ('products_cnt', 'mean'),
        # The first population value seen in each state group is kept
        'population': ('population', 'first'),
        'avg_order_weight_kg': ('total_weight_kg', 'mean'),
        'avg_order_volume_cm3': ('total_volume_cm3', 'mean'),
    })
    .merge(temp_mau, how='left', on='customer_state_short')
)

In [None]:
# Cancel rate per state, computed over df_orders: the mean of a boolean flag
# that is True when order_status equals 'Canceled'.
tmp_df_res = (
    df_orders.assign(
        is_canceled = lambda x: x.order_status=='Canceled'
    )
    .merge(df_customers_origin[['customer_id', 'customer_state_short']], on='customer_id')
    # observed=False is stated explicitly, as in the other state-level groupbys,
    # so the result does not depend on the pandas default for categorical keys.
    .groupby('customer_state_short', as_index=False, observed=False)
    .agg(
        cancel_rate = ('is_canceled', 'mean')
    )
    # Right join: keep exactly the states already present in tmp_df_res.
    .merge(tmp_df_res, on='customer_state_short', how='right')
)

In [None]:
# Derive per-capita and share metrics from the aggregated state counters.
tmp_df_res = tmp_df_res.assign(
    # Scaled by 1000 and divided by the `population` column
    orders_per_thousand_person=lambda frame: frame['total_orders'] * 1000 / frame['population'],
    total_payment_per_thousand_person=lambda frame: frame['total_payment'] * 1000 / frame['population'],
    # Orders not counted in first_orders_cnt, as a fraction of all orders
    repeat_purchase_rate=lambda frame: (
        (frame['total_orders'] - frame['first_orders_cnt']) / frame['total_orders']
    ),
    # Orders counted in installment_orders_cnt, as a fraction of all orders
    installment_orders_rate=lambda frame: frame['installment_orders_cnt'] / frame['total_orders'],
)

In [None]:
del temp_mau

Let's calculate, for each state, the median retention in the first lifetime period across cohorts. We define one lifetime period as 30 days.


In [None]:
retention_1st_month_state = (
    df_sales.merge(df_customers_origin[['customer_id', 'customer_state_short']], on='customer_id', how='left')
)

In [None]:
# Cohort = calendar month of the customer's first purchase.
retention_1st_month_state['cohort'] = retention_1st_month_state['customer_first_purchase_dt'].dt.to_period('M')
# Lifetime = number of whole 30-day periods between the first purchase and this order
# (0 for orders placed within the first 30 days).
retention_1st_month_state['lifetime'] = (retention_1st_month_state['order_purchase_dt'] - retention_1st_month_state['customer_first_purchase_dt']).dt.days // 30

In [None]:
# Unique buyers per (state, cohort, lifetime), pivoted so that each lifetime
# period becomes a column; combinations with no buyers are filled with 0.
retention_1st_month_state = (
    retention_1st_month_state.groupby(['customer_state_short', 'cohort', 'lifetime'])['customer_unique_id']
    .nunique()
    .unstack()
    .fillna(0)
)

In [None]:
# Retention in lifetime period 1 = buyers in period 1 / buyers in period 0,
# per (state, cohort) row; reset_index() turns the row keys back into columns.
# A row with no period-0 buyers would divide by zero and produce inf, which
# would distort the per-state median taken afterwards, so zero denominators
# are masked as NaN (the median skips NaN).
retention_1st_month_state = retention_1st_month_state.div(
    retention_1st_month_state[0].replace(0, float('nan')), axis=0
)[1].reset_index()

In [None]:
# Median of the per-cohort values within each state (one row per state).
retention_1st_month_state = (
    retention_1st_month_state.groupby('customer_state_short', as_index=False, observed=False)
    .median()
)

In [None]:
retention_1st_month_state['retention_1st_month_state'] = retention_1st_month_state[1]

In [None]:
retention_1st_month_state = retention_1st_month_state[['customer_state_short', 'retention_1st_month_state']]

In [None]:
tmp_df_res = tmp_df_res.merge(retention_1st_month_state, on='customer_state_short', how='left')

#### 4.10.2.2 Data Visualization

Add the state-level metrics to `labels_for_map`.

In [None]:
# Register labels for the state-level metrics and relabel 'mau' for the state maps.
labels_for_map.update({
    'total_reviews': 'Number of Reviews',
    'avg_delivery_delay_days': 'Average Delivery Delay, days',
    'orders_per_thousand_person': 'Number of Orders per Thousand Residents',
    'total_payment_per_thousand_person': 'Sales Amount per Thousand Residents, R$',
    'retention_1st_month_state': 'Retention 1st Month by State',
    'customer_state_short': 'Customer State',
    # Overrides the existing 'MAU' label
    'mau': 'Average MAU',
})

Let’s calculate the centroids of the states.

In [None]:
# Download the Brazil states GeoJSON and parse the response body as JSON.
brazil_states_geojson = "https://raw.githubusercontent.com/codeforamerica/click_that_hood/master/public/data/brazil-states.geojson"
with urlopen(brazil_states_geojson) as response:
    geojson = json.load(response)

# Build a GeoDataFrame from the GeoJSON feature list.
gdf = gpd.GeoDataFrame.from_features(geojson['features'])

In [None]:
# Geometric centroid of each feature; its x/y are stored as lon/lat.
# NOTE(review): no CRS is set when gdf is built, so the centroids are computed
# directly on the raw GeoJSON coordinates (presumably degrees) — confirm this
# is accurate enough for label placement.
gdf['centroid'] = gdf['geometry'].centroid
gdf['centroid_lon'] = gdf['centroid'].x
gdf['centroid_lat'] = gdf['centroid'].y

# 'sigla' is the GeoJSON property that plot_map_state matches states on.
state_centroids = gdf[['sigla', 'centroid_lon', 'centroid_lat']]
state_centroids.head(1)

In [None]:
def plot_map_state(metric: str):
    """Create a Plotly choropleth of ``metric`` by state with state-code labels.

    Reads the module-level ``tmp_df_res`` (one row per state),
    ``labels_for_map``, ``brazil_states_geojson`` and ``state_centroids``.
    The label text before the first comma goes into the figure title; the text
    after it (if any) becomes the colorbar title.

    Args:
        metric: Column of ``tmp_df_res`` used to colour the states; it must
            also be a key of ``labels_for_map``.

    Returns:
        The created figure, after it has also been passed to ``pb.to_slide``.
    """
    metric_label = labels_for_map[metric]
    title = f"Distribution of {metric_label.split(',')[0]} by State"
    colorbar_title = metric_label.split(',')[1] if ',' in metric_label else None

    # Text colour of each state code depends on where the state's value sits
    # on the colour scale: light text below 70% of the value range, dark text
    # above it. The mapping is built directly instead of adding a column to a
    # slice of tmp_df_res, which triggered SettingWithCopyWarning.
    metric_min = tmp_df_res[metric].min()
    metric_max = tmp_df_res[metric].max()
    color_level = (metric_max - metric_min) * 0.7 + metric_min
    light_text = 'rgba(255, 255, 255, 0.8)'
    dark_text = 'rgba(50, 50, 50, 0.8)'
    color_dict = {
        state: light_text if value < color_level else dark_text
        for state, value in zip(tmp_df_res['customer_state_short'], tmp_df_res[metric])
    }
    # States present in the GeoJSON but absent from tmp_df_res fall back to
    # gray instead of raising KeyError.
    label_colors = [color_dict.get(state, 'gray') for state in state_centroids['sigla']]

    is_percentage = metric in [
        'avg_freight_ratio', 'orders_delayed_share',
        'installment_orders_rate', 'repeat_purchase_rate',
        'cancel_rate', 'retention_1st_month_state']
    is_retention = metric == 'retention_1st_month_state'

    hover_data = dict()
    if metric != 'total_orders':
        # Order volume accompanies every other metric in the tooltip.
        hover_data['total_orders'] = ':.2s'
    if not is_percentage:
        hover_data[metric] = ':.2s'
    elif is_retention:
        # Retention gets one more decimal place than the other shares.
        hover_data[metric] = ':.2%'
    else:
        hover_data[metric] = ':.1%'

    fig = px.choropleth(
        tmp_df_res,
        geojson=brazil_states_geojson,
        locations='customer_state_short',
        featureidkey="properties.sigla",
        color=metric,
        color_continuous_scale="Viridis",
        title=title,
        labels=labels_for_map,
        hover_data=hover_data
    )

    # Overlay the state codes at the state centroids, coloured per state.
    fig.add_trace(
        go.Scattergeo(
            lon=state_centroids['centroid_lon'],
            lat=state_centroids['centroid_lat'],
            text=state_centroids['sigla'],
            hoverinfo='none',
            mode='text',
            textfont=dict(
                color=label_colors,
                size=10
            ),
            showlegend=False
        )
    )

    if is_percentage:
        fig.update_coloraxes(
            colorbar_tickformat=".2%" if is_retention else ".0%"
        )

    # Hide the base map and frame the view on Brazil.
    fig.update_geos(
        visible=False,
        lataxis_range=[-40.7, 7.3],
        lonaxis_range=[-85, -34.5],
        projection_scale=1.2,
        center=dict(lat=-15, lon=-55)
    )

    fig.update_layout(
        margin={"r":0,"t":50,"l":0,"b":0}
        , width=550
        , height=500
        , coloraxis_colorbar_title_text = colorbar_title
    )
    pb.to_slide(fig)
    return fig

**In which states is the sales volume higher?**

In [None]:
plot_map_state('total_orders')

**What is the number of orders per resident?**

In [None]:
plot_map_state('orders_per_thousand_person')

**In which states is the sales amount higher?**

In [None]:
plot_map_state('total_payment')

**What is the sales amount per resident?**

In [None]:
plot_map_state('total_payment_per_thousand_person')

**In which states is the average MAU higher?**

In [None]:
plot_map_state('mau')

**In which states is the average order value higher?**

In [None]:
plot_map_state('aov')

**What is the proportion of repeat purchases by states?**

In [None]:
plot_map_state('repeat_purchase_rate')

**In which states is the number of reviews higher?**

In [None]:
plot_map_state('total_reviews')

**In which states is the review score higher?**

In [None]:
plot_map_state('avg_review_score')

**In which states is the average delivery time higher?**

In [None]:
plot_map_state('avg_delivery_time_days')

**In which states is the average delivery delay higher?**

In [None]:
plot_map_state('avg_delivery_delay_days')

**In which states is the proportion of canceled orders higher?**

In [None]:
plot_map_state('cancel_rate')

**In which states is the proportion of orders with installment payments higher?**

In [None]:
plot_map_state('installment_orders_rate')

**In which states is the number of installments higher?**

In [None]:
plot_map_state('avg_installments')

**In which states is the number of items per order higher?**

In [None]:
plot_map_state('avg_products_cnt')

**In which states are the heaviest orders?**

In [None]:
plot_map_state('avg_order_weight_kg')

**In which states are the largest volume orders?**

In [None]:
plot_map_state('avg_order_volume_cm3')

**In which states is the median first-month retention higher?**

In [None]:
plot_map_state('retention_1st_month_state')

<h2 id="4-11"> 4.11 Cohort Analysis</h2>

We will analyze different metrics by monthly cohorts. 

We will choose a period of 30 days for lifetime, meaning each 30 days, the cohort's lifetime will increase by 1.

**Number of Sales**

In [None]:
fig = df_sales.analysis.cohort(
    mode='sales'
    , user_id_col='customer_unique_id'
    , date_col='order_purchase_dt'
    , order_id_col='order_id'
)
pb.to_slide(fig)
fig.show()

**Key Observations:**  

- Most purchases occur in the cohort's first month  
- Stabilization after first month without sharp decline  

**Revenue**

Since Olist is a marketplace, it is reasonable to calculate revenue by the total amount of sold products. This is because the revenue for marketplaces typically comes from commissions on sellers.

In [None]:
fig = df_sales.analysis.cohort(
    mode='revenue'
    , user_id_col='customer_unique_id'
    , date_col='order_purchase_dt'
    , order_id_col='order_id'
    , revenue_col='total_products_price'
    , text_auto='.2s'
)
pb.to_slide(fig)
fig.show()

**Key Observations:**  

- Revenue follows similar pattern to sales volume  
- Majority of cohort revenue generated in first month  
- No sharp lifetime decline observed  

**Average Order Value**

For calculating the average order value, it is reasonable to take the total payment amount per order. This is because the average order value should reflect the total cost of the order.

In [None]:
fig = df_sales.analysis.cohort(
    mode='aov'
    , user_id_col='customer_unique_id'
    , date_col='order_purchase_dt'
    , order_id_col='order_id'
    , revenue_col='total_payment'
)
pb.to_slide(fig)
fig.show()

**Key Observations:**  

- No post-first-month decline in average order value (relative metric)  
- No clear growth/decline trend across cohort lifetimes  
- Some peak values in isolated periods  
- Median AOV shows slight decline after 11 months  
- Anomalously high AOV:  
  - March 2017 cohort (month 5)  
  - April 2017 cohort (month 15)  

**Number of Buyers**

In [None]:
fig = df_sales.analysis.cohort(
    mode='buyers'
    , user_id_col='customer_unique_id'
    , date_col='order_purchase_dt'
    , order_id_col='order_id'
)
pb.to_slide(fig)
fig.show()

**Key Observations:**  

- Customer count pattern mirrors revenue/sales  
- Almost no customers return after the first month  

**Retention**

In [None]:
fig = df_sales.analysis.cohort(
    mode='retention'
    , user_id_col='customer_unique_id'
    , date_col='order_purchase_dt'
    , order_id_col='order_id'
    , include_period0=False
)
pb.to_slide(fig) 
fig.show()

**Key Observations:**  

- Extremely low 1st+ month retention  
- Olist shows very poor customer retention  

Let's look at the median retention of cohorts by periods.

In [None]:
fig = df_sales.analysis.cohort(
    mode='retention'
    , user_id_col='customer_unique_id'
    , date_col='order_purchase_dt'
    , order_id_col='order_id'
    , display_mode='summary'
)
pb.to_slide(fig) 
fig.show()

**Key Observations:**  

- Clear pattern: minimal customers return for repeat purchases  

**Average Payment Count**

In [None]:
fig = df_sales.analysis.cohort(
    mode='apc'
    , user_id_col='customer_unique_id'
    , date_col='order_purchase_dt'
    , order_id_col='order_id'
) 
pb.to_slide(fig) 
fig.show()

**Key Observations:**  

- Customers typically make 1-1.3 purchases per period  

**ARPPU**

For ARPPU, it is reasonable to take the total cost of products in the order, as we need the revenue per customer.

In [None]:
fig = df_sales.analysis.cohort(
    mode='arppu'
    , user_id_col='customer_unique_id'
    , date_col='order_purchase_dt'
    , order_id_col='order_id'
    , revenue_col='total_products_price'
) 
pb.to_slide(fig) 
fig.show()

**Key Observations:**  

- Pattern similar to average order value  
- ARPPU resembles AOV since most customers make single purchases  

**LTV (Revenue-Based)**

Since we do not have data on expenses or the margin coefficient, we cannot calculate the actual LTV. 

We will calculate LTV (Revenue-Based), assuming a margin coefficient of 1. This will serve as a proxy for LTV.

In [None]:
# Revenue-based LTV by cohort, with the margin coefficient fixed at 1.
# NOTE(review): unlike the 'revenue', 'aov' and 'arppu' calls, no revenue_col
# is passed here — confirm which revenue column cohort() uses for mode='ltv'.
fig = df_sales.analysis.cohort(
    mode='ltv'
    , user_id_col='customer_unique_id'
    , date_col='order_purchase_dt'
    , order_id_col='order_id'
    , margin=1
) 
pb.to_slide(fig) 
fig.show()

**Key Observations:**  

- Revenue-Based LTV remains stable across lifetime periods  
- Expected pattern since most purchases occur in first month  
- LTV reflects cumulative lifetime performance  

<h2 id="4-12"> 4.12 Correlation Analysis</h2>

Here we will build a correlation matrix for all numerical variables. 

We will also create scatter plots for those with a correlation coefficient greater than 0.3. 

Additionally, we can create scatter plots for specific pairs even if the correlation coefficient is low, as we are focused on the relationship rather than the correlation coefficient itself.

### 4.12.1 Table df_sales

We will create labels for column names.

In [None]:
labels = {
    'from_purchase_to_approved_hours': 'Purchase To Approved',
    'from_purchase_to_carrier_days': 'Purchase To Carrier',
    'delivery_time_days': 'Delivery Time',
    'delivery_time_estimated_days': 'Estimated Delivery',
    'delivery_delay_days': 'Delivery Delay',
    'from_approved_to_carrier_days': 'Approved To Carrier',
    'from_carrier_to_customer_days': 'Carrier To Customer',
    'payments_cnt': 'Payments Cnt',
    'total_payment': 'Total Payment',
    'avg_payment': 'Avg Payment',
    'total_installments_cnt': 'Total Installments',
    'products_cnt': 'Products Cnt',
    'unique_products_cnt': 'Unique Products Cnt',
    'sellers_cnt': 'Sellers Cnt',
    'product_categories_cnt': 'Categories Cnt',
    'total_products_price': 'Total Products Price',
    'avg_products_price': 'Avg Product Price',
    'total_freight_value': 'Total Freight Value',
    'total_order_price': 'Total Order Price',
    'total_weight_kg': 'Total Weight',
    'total_volume_cm3': 'Total Volume',
    'avg_distance_km': 'Avg Distance',
    'avg_carrier_delivery_delay_days': 'Avg Carrier Delay',
    'freight_ratio': 'Freight Ratio',
    'reviews_cnt': 'Reviews Cnt',
    'order_avg_reviews_score': 'Avg Review Score'
}

Let's look at the correlation between the metrics.

In [None]:
df_sales.analysis.corr_matrix(labels=labels)

**Key Observations:**
- There is a moderate positive correlation (0.4) between the total delivery time and the delivery time to the carrier. Therefore, the delivery time to the carrier affects the total delivery time, which is logical.
- There is a moderate positive correlation (0.6) between the total delivery time and the delivery time from the carrier. The delivery time from the carrier has a higher correlation with the total delivery time compared to the delivery time to the carrier. This suggests that the delivery time from the carrier affects the total delivery time more significantly.
- There is a moderate positive correlation (0.5) between the total number of products in an order and the number of unique products in an order. This indicates that the total number of products in an order often increases due to the addition of unique products rather than by increasing the quantity of already existing products in the order.
- There is also a moderate positive correlation (0.6) between the number of sellers in an order and the number of unique products in an order. This means that if customers buy more unique products, they are likely to buy from different sellers.
- There is also a moderate positive correlation (0.6) between the number of sellers in an order and the number of categories in an order. This indicates that if customers buy products from more categories, they are likely to buy from different sellers.
- There is a moderate positive correlation (0.5) between the delivery cost of an order and the order amount. This means that the more expensive the order, the higher the delivery cost, which is logical.
- There is a moderate positive correlation (0.6) between the delivery cost of an order and the weight/volume of the order. This suggests that the heavier or larger the order, the higher the delivery cost.
- There is a strong positive correlation (0.8) between the volume and weight of the order. This indicates that the heavier the order, the larger it is usually.


**Does the order cost affect the delivery cost?**

In [None]:
df_sales.viz.pairplot(
    pairs=['total_payment', 'total_freight_value']
    , transforms='log'
    , labels=labels
)

**Key Observations:**  


- Higher order value → higher shipping cost  

**Does the order weight affect the delivery cost?**

In [None]:
df_sales.viz.pairplot(
    pairs=['total_weight_kg', 'total_freight_value']
    , transforms='log'
    , labels=labels
)

**Key Observations:**  

- Heavier orders → higher shipping cost  

---

**Does the order volume affect the delivery cost?**

In [None]:
df_sales.viz.pairplot(
    pairs=['total_volume_cm3', 'total_freight_value']
    , transforms='log'
    , labels=labels
)


**Key Observations:**  

- Larger volume → higher shipping cost  

---

**Does the distance affect the delivery cost?**

In [None]:
df_sales.viz.pairplot(
    pairs=['avg_distance_km', 'total_freight_value']
    , transforms='log'
    , labels=labels
)

**Key Observations:**  

- Greater distance → higher shipping cost  

---

**Does the order cost affect the delivery time?**

In [None]:
df_sales.viz.pairplot(
    pairs=['total_payment', 'delivery_time_days']
    , transforms='log'
    , labels=labels
)

**Key Observations:**  

- Order value has minimal impact on delivery time  

**Does the order weight affect the delivery time?**

In [None]:
df_sales.viz.pairplot(
    pairs=['total_weight_kg', 'delivery_time_days']
    , transforms='log'
    , labels=labels
)

**Key Observations:**  

- Weight has minimal impact on delivery time  

---

**Does the order volume affect the delivery time?**

In [None]:
df_sales.viz.pairplot(
    pairs=['total_volume_cm3', 'delivery_time_days']
    , transforms='log'
    , labels=labels
)

**Key Observations:**  

- Volume doesn't affect delivery time  

---

**Does the distance affect the delivery time?**

In [None]:
df_sales.viz.pairplot(
    pairs=['avg_distance_km', 'delivery_time_days']
    , transforms='log'
    , labels=labels
)

**Key Observations:**  

- Greater distance → longer delivery time  

---

**Are heavier orders also larger in volume?**

In [None]:
df_sales.viz.pairplot(
    pairs=['total_weight_kg', 'total_volume_cm3']
    , transforms='log'
    , labels=labels
)

**Key Observations:**  

- Heavier items tend to be larger  

### 4.12.2 Table df_customers

We will create labels for column names.

In [None]:
labels = {
    'orders_cnt': 'Total Orders',
    'canceled_share': 'Canceled Share',
    'canceled_orders_cnt': 'Canceled Orders',
    'not_delivered_share': 'Not Delivered Share',
    'customer_issue_share': 'Customer Issue Share',
    'service_issue_share': 'Service Issue Share',
    'buys_cnt': 'Total Purchases',
    'avg_delivery_time_days': 'Avg Delivery Time',
    'avg_delivery_delay_days': 'Avg Delivery Delay',
    'delayed_orders_share': 'Delayed Orders Share',
    'purchase_weekend_share': 'Weekend Purchase Share',
    'repeat_purchase_share': 'Repeat Purchase Share',
    'avg_payments_cnt': 'Avg Payments Cnt',
    'total_customer_payment': 'Total Payment',
    'avg_total_order_payment': 'Avg Order Payment',
    'avg_individual_payment': 'Avg Individual Payment',
    'installment_orders_share': 'Installment Orders Share',
    'avg_products_cnt': 'Avg Products Cnt',
    'avg_unique_products_cnt': 'Avg Unique Products',
    'avg_sellers_cnt': 'Avg Sellers Cnt',
    'avg_order_total_products_price': 'Avg Order Products Price',
    'avg_total_order_price': 'Avg Total Order Price',
    'avg_products_price': 'Avg Product Price',
    'total_products_price': 'Total Products Price',
    'avg_order_total_freight_value': 'Avg Order Freight Value',
    'avg_order_total_weight_kg': 'Avg Order Weight',
    'avg_order_total_volume_cm3': 'Avg Order Volume',
    'free_shipping_share': 'Free Shipping Share',
    'reviews_cnt': 'Reviews Cnt',
    'customer_avg_reviews_score': 'Avg Review Score',
    'avg_distance_km': 'Avg Distance',
    'avg_buys_diff_days': 'Days Between Purchases',
    'months_with_buys': 'Active Months',
    'max_consecutive_months_with_buys': 'Max Consecutive Active Months'
}

Let's look at the correlation between the metrics.

In [None]:
exclude_cols = ['lat_customer', 'lng_customer', 'customer_zip_code_prefix_3_digits', 'population', 'customer_zip_code_prefix']
(
    df_customers.drop(columns=exclude_cols)
    .analysis.corr_matrix(text_size=10, labels=labels)
)

**Key Observations:**

- There is a high positive correlation (0.8) between the number of canceled orders and the proportion of issues due to the customer. This is logical, as the Canceled status is included in the proportion of customer issues.
- There is a high positive correlation (0.9) between the number of orders and the number of reviews. Therefore, the more orders a customer makes, the more reviews they leave.


**Are there "loyal" customers with high activity?**

In [None]:
df_customers.viz.pairplot(
    pairs=['max_consecutive_months_with_buys', 'orders_cnt']
    , labels=labels
)

**Key Observations:**  

- More active months → more orders  

---

**If customers buy many unique products, are the orders from different sellers?**

In [None]:
df_customers.viz.pairplot(
    pairs=['avg_unique_products_cnt', 'avg_sellers_cnt']
    , transforms='log'
    , labels=labels
)

**Key Observations:**  

- The higher the average number of unique products in an order for a customer, the higher the average number of sellers. This means that different products are more often bought from different sellers.


---

**Are active customers more likely to write reviews?**

In [None]:
df_customers.viz.pairplot(
    pairs=['reviews_cnt', 'orders_cnt']
    , transforms='log'
    , labels=labels
)

**Key Observations:**  

- More purchases → more reviews left  

<h2 id="4-13"> 4.13 Slice Analysis</h2>

### 4.13.1 Black Friday

Let's examine Black Friday, specifically November 24, 2017. 

We will consider November 23-25 to account for one day before and one day after.


In [None]:
# Black Friday window: Nov 23-25, 2017 (the day before, the day itself, the day after).
# Series.between is inclusive on both ends by default, so the right bound is
# made exclusive; otherwise orders stamped exactly 2017-11-26 00:00:00 leak
# into the slice and create a spurious Nov 26 bucket in the hourly charts.
mask = df_orders.order_purchase_dt.between('2017-11-23', '2017-11-26', inclusive='left')
df_black_orders = df_orders[mask]
mask = df_sales.order_purchase_dt.between('2017-11-23', '2017-11-26', inclusive='left')
df_black_sales = df_sales[mask]

#### 4.13.1.1 Number of Sales

**By Hour**

In [None]:
pb.configure(
    df = df_black_sales
    , time_column = 'order_purchase_dt'
    , time_column_label = 'Date'
    , metric = 'order_id'
    , metric_label = 'Number of Sales'
    , agg_func = 'nunique'
    , freq = 'h'
)
pb.line_resample()

**Key Observations:**  

- Sales grew from 5AM-10AM (first peak)  
- Second peak at 1PM  
- Decline until 6PM  
- Evening growth until 10PM peak  
- Anomalous spike at midnight Nov 24 (Black Friday start)  

In [None]:
pb.configure(
    df = df_black_sales
    , time_column = 'order_purchase_dt'
    , time_column_label = 'Date'
    , metric = 'order_id'
    , metric_label = 'Share of Sales'
    , agg_func = 'nunique'
    , freq = 'h'
    , norm_by='all'
    , axis_sort_order='descending'    
    , text_auto='.1%'
    , update_fig={'xaxis': {'tickformat': '.0%'}}
)

**By Payment Category**

In [None]:
pb.bar_groupby(y='order_total_payment_cat', to_slide='_black_friday')

**Key Observations:**  

- 64% of orders are medium-priced  

**By Order Weight Category**

In [None]:
pb.bar_groupby(y='order_total_weight_cat', to_slide='_black_friday')

**Key Observations:**  

- 52% medium weight, 36% light  

**By Delivery Time Category**

In [None]:
pb.bar_groupby(y='delivery_time_days_cat', to_slide='_black_friday')

**Key Observations:**  

- Delivery time categories:  
  - Long: 49%  
  - Medium: 46%  
  - Fast: 6%  
- Black Friday increased delivery times  

**By Presence of Installment Payments**

In [None]:
pb.bar_groupby(y='order_has_installment', to_slide='_black_friday')

**Key Observations:**  

- 58% of orders used installments  

**By Payment Type**

In [None]:
pb.bar_groupby(y='order_payment_types', to_slide='_black_friday')

**Key Observations:**  

- Payment methods:  
  - Credit card: 79%  
  - Boleto: 18%  

**By Product Category**

In [None]:
pb.bar_groupby(
    y='order_product_categories'
    , to_slide='_black_friday'
)

**Key Observations:**  

- The largest share of orders consisted of products from the Bed Bath Table category (13%).

**By Generalized Product Category**

In [None]:
pb.bar_groupby(y='order_general_product_categories', to_slide='_black_friday')

**Key Observations:**  

- Top generalized categories:  
  - Bed Bath Table: 25%  
  - Furniture: 23%  

**By Review Score**

In [None]:
pb.bar_groupby(y='order_avg_reviews_score', to_slide='_black_friday')

**Key Observations:**  

- Review scores:  
  - 5 stars: 49%  
  - 1 star: 17.5%  
  - 3 stars: 10.7%  
  - 2 stars: 4.3%  

**By Top Customer States**

In [None]:
pb.bar_groupby(y='customer_state', to_slide='_black_friday')

**Key Observations:**  

- Sales by state:  
  - São Paulo: 37%  
  - Rio/Minas: 14% each  
  - Others: ≤6%  

**By Top Customer Cities**

In [None]:
pb.bar_groupby(y='customer_city', to_slide='_black_friday')

**Key Observations:**  

- Sales by city:  
  - São Paulo: 14%  
  - Rio: 8%  
  - Others: ≤3%  

#### 4.13.1.2 Sum of Sales

**By Hour**

In [None]:
pb.configure(
    df = df_black_sales
    , time_column = 'order_purchase_dt'
    , time_column_label = 'Date'
    , metric = 'total_payment'
    , metric_label = 'Sales Amount, R$'
    , title_base = 'Sales Amount'
    , agg_func = 'sum'
    , freq = 'h'
)
pb.line_resample(to_slide='_black_friday')

**Key Observations:**  

- Revenue peaked 10-11AM, declined until 6PM, then rose without surpassing morning peak  

#### 4.13.1.3 Average Order Value

**By Hour**

In [None]:
pb.configure(
    df = df_black_sales
    , time_column = 'order_purchase_dt'
    , time_column_label = 'Date'
    , metric = 'total_payment'
    , metric_label = 'Average Order Value, R$'
    , metric_label_for_distribution = 'Order Value, R$'
    , title_base = 'Average Order Value'
    , agg_func = 'mean'
    , freq='h'
)
pb.line_resample(to_slide='_black_friday')

**Key Observations:**

- There is an anomalous hour (4 AM) on November 25, 2017, in the average order value.
- It is not clear that the average order value was higher on Black Friday compared to the neighboring days.

In [None]:
pb.configure(
    df = df_black_sales
    , metric = 'total_payment'
    , metric_label = 'Average Order Value, R$'
    , metric_label_for_distribution = 'Order Value, R$'
    , agg_func = 'mean'
    , title_base = 'Average Order Value and Number of Sales'
    , axis_sort_order='descending'
    , text_auto='.0f'
)

Top Orders.

In [None]:
pb.metric_top()

Let's look at the statistics and distribution of the metric.

In [None]:
pb.metric_info(
    labels=dict(total_payment='Order Value, R$')
    , title='Distribution of Order Value'
    , upper_quantile=0.95
    , hist_mode='dual_hist_trim'
)

**Key Observations:**  

- 75% of Black Friday orders ≤170 R$ (many >1000 R$ outliers)  

**By Top Customer States**

In [None]:
pb.bar_groupby(
    y='customer_state'
    , show_count=True
).show()

**Key Observations:**  

- Highest order values in:  
  1. Espírito Santo  
  2. Mato Grosso  
  3. Minas Gerais  

#### 4.13.1.4 Number of Customers

**By Hour**

In [None]:
pb.configure(
    df = df_black_sales
    , time_column = 'order_purchase_dt'
    , time_column_label = 'Date'
    , metric = 'customer_unique_id'
    , metric_label = 'Number of Customers'
    , agg_func = 'nunique'
    , freq = 'h'
)
pb.line_resample(to_slide='_black_friday')

**Key Observations:**  

- Customer count trend matches order count (mostly single orders)  


There is no point in breaking this metric down by segments: customers mostly placed a single order, so the results would mirror the order counts.


#### 4.13.1.5 Share of Unavailable Orders

In [None]:
# Set the `pb` defaults used for the "Share of Unavailable Orders" chart.
# NOTE(review): no `df` is passed here; the chart below is drawn from an explicitly
# supplied data frame, so `pb` presumably keeps whatever data frame was configured
# before — confirm this is intended.
pb.configure(
    time_column = 'order_purchase_dt'
    , time_column_label = 'Date'
    , metric = 'target_share'
    , metric_label = 'Share of Unavailable Orders'
    # NOTE(review): 'ME' differs from the hourly ('h') resampling used when the share
    # is calculated in the next cell — verify which frequency the chart should use.
    , freq='ME'
)

In [None]:
tmp_tmp_df_res = df_black_orders['order_status'].preproc.calc_target_category_share(
    target_category='Unavailable'
    , group_columns=['order_purchase_dt']
    , resample_freq = 'h'
)

In [None]:
pb.line(
    data_frame=tmp_tmp_df_res
)

**Key Observations:**  

- The share of orders with the "Unavailable" status spikes at 12 AM (midnight) and 10 AM on November 23, 2017, and at 3 AM on November 24, 2017. There is also a very strong spike on November 26, 2017.  
- It is not clear that there was a shortage of products specifically on Black Friday.


#### 4.13.1.6 Reviews Score

In [None]:
pb.configure(
    df = df_black_sales
    , metric = 'order_avg_reviews_score'
    , metric_label = 'Average Order Reviews Score'
    , metric_label_for_distribution = 'Order Reviews Score'
)

In [None]:
pb.metric_info(
    labels=dict(order_avg_reviews_score='Order Reviews Score')
    , title='Distribution of Order Reviews Score'
    , xaxis_type='category'
)

**Key Observations:**  

- 49% of Nov 23-25 orders had 5-star reviews  

### 4.13.2 Canceled Orders

Analyze canceled orders.

In [None]:
# Keep only the orders whose status is exactly 'Canceled'.
is_canceled = df_orders['order_status'] == 'Canceled'
df_canceled = df_orders.loc[is_canceled]

#### 4.13.2.1 Number of Orders

**By month**

In [None]:
pb.configure(
    df = df_canceled
    , time_column = 'order_purchase_dt'
    , time_column_label = 'Date'
    , metric = 'order_id'
    , metric_label = 'Number of Canceled Orders'
    , agg_func = 'nunique'
    , freq = 'ME'
)
pb.line_resample()

**Key Observations:**  

- Typically 20-40 canceled orders/month  
- Anomalous spikes in Feb/Aug 2018  

In [None]:
pb.configure(
    df = df_canceled
    , time_column = 'order_purchase_dt'
    , time_column_label = 'Date'
    , metric = 'order_id'
    , metric_label = 'Share of Canceled Orders'
    , agg_func = 'nunique'
    , freq = 'h'
    , norm_by='all'
    , axis_sort_order='descending'    
    , text_auto='.1%'
    , update_fig={'xaxis': {'tickformat': '.0%'}}
)

**By Payment Category**

In [None]:
pb.bar_groupby(y='order_total_payment_cat')

**Key Observations:**  

- Canceled orders:  
  - Medium price: 55%  
  - Expensive: 28%  

**By Order Weight Category**

In [None]:
pb.bar_groupby(y='order_total_weight_cat')

**Key Observations:**  

- Canceled order weights:  
  - Medium: 36%  
  - Light: 31%  


**By Presence of Installment Payments**

In [None]:
pb.bar_groupby(y='order_has_installment')

**Key Observations:**  

- 51% of canceled orders didn't use installments  

**By Payment Type**

In [None]:
pb.bar_groupby(y='order_payment_types')

**Key Observations:**  

- Canceled order payments:  
  - Credit card: 70%  
  - Boleto: 16%  
  - Voucher: 10% (higher than overall)  

**By Product Category**

In [None]:
pb.bar_groupby(
    y='order_product_categories'
    , text_auto=False
)

**Key Observations:**  

- Top canceled order categories:  
  1. Bed Bath Table  
  2. Health Beauty  
  3. Sports Leisure  

**By Generalized Product Category**

In [None]:
pb.bar_groupby(
    y='order_general_product_categories'
    , text_auto=False
)

**Key Observations:**  

- Top generalized canceled categories:  
  1. Electronics  
  2. Furniture  
  3. Home & Garden  

**By Top Customer States**

In [None]:
pb.bar_groupby(y='customer_state', text_auto=False)

**Key Observations:**  

- Canceled orders by state:  
  - São Paulo: 42%  
  - Rio: 13%  
  - Minas: 12%  
  - Others: ≤6%  

#### 4.13.2.2 Sum of Orders

**By month**

In [None]:
pb.configure(
    df = df_canceled
    , time_column = 'order_purchase_dt'
    , time_column_label = 'Date'
    , metric = 'total_payment'
    , metric_label = 'Canceled Orders Amount, R$'
    , title_base = 'Canceled Orders Amount'
    , agg_func = 'sum'
    , freq = 'ME'
)
pb.line_resample()

**Key Observations:**  

- Canceled order revenue typically 4K-8K R$/month  
- Anomalous spikes in Jul/Aug 2018  

#### 4.13.2.3 Average Order Value

**By month**

In [None]:
pb.configure(
    df = df_canceled
    , time_column = 'order_purchase_dt'
    , time_column_label = 'Date'
    , metric = 'total_payment'
    , metric_label = 'AOV (Canceled), R$'
    , title_base = 'AOV (Canceled)'
    , agg_func = 'mean'
    , freq = 'ME'
)
pb.line_resample()

**Key Observations:**  

- Canceled order value spikes in Apr 2017 and Jul 2018  

<h2 id="4-14"> 4.14 Analysis of Customer Segments</h2>

In [None]:
pb.configure(
    df = df_customers
    , metric = 'customer_unique_id'
    , metric_label = 'Share of Customers'
    , agg_func = 'nunique'
    , norm_by='all'
    , axis_sort_order='descending'    
    , text_auto='.1%'
)

Save the customer segment dimensions in a separate list.

In [None]:
customers_dim = [
    "activity_segment"
    , "value_segment"
    , "purchase_freq_segment"
    , "repeat_segment"
    , "loyalty_segment"
    , "risk_segment"
    , "weekday_segment"
    , "installment_segment"
    , "products_cnt_segment"
    , "weight_segment"
    , "customer_top_purchase_weekdays"
    , "customer_payment_types"
    , "customer_top_product_categories"
    , "customer_top_general_product_categories"
    , "customer_city"
    , "customer_state"
]

### 4.14.1 Distribution of Customers by Segments

Examine how customers are distributed across each segment and compare key metrics between segments.

Select the following key customer metrics.

- total_customer_payment
- avg_total_order_payment
- buys_cnt
- from_first_to_last_days
- customer_avg_reviews_score
- canceled_share
- purchase_weekend_share
- avg_products_cnt
- avg_delivery_delay_days
- avg_order_total_weight_kg

In [None]:
selected_metrics = [
    'total_customer_payment',
    'avg_total_order_payment', 
    'buys_cnt',
    'from_first_to_last_days',
    'customer_avg_reviews_score',
    'canceled_share',
    'purchase_weekend_share',
    'avg_products_cnt',
    'avg_delivery_delay_days',
    'avg_order_total_weight_kg'
]

Give more readable names for the metrics on the graphs.

In [None]:
metric_labels = {
    'total_customer_payment': 'Total Spending',
    'avg_total_order_payment': 'Average Order Value',
    'buys_cnt': 'Number of Purchases',
    'from_first_to_last_days': 'Customer Lifetime',
    'customer_avg_reviews_score': 'Average Rating',
    'canceled_share': 'Order Cancelation Rate',
    'purchase_weekend_share': 'Weekend Purchase Ratio',
    'avg_products_cnt': 'Average Items per Order',
    'avg_delivery_delay_days': 'Avg Delivery Delay',
    'avg_order_total_weight_kg': 'Avg Order Weight'
}

In [None]:
labels_for_polar={**base_labels, **metric_labels}

**By Activity Segment**

In [None]:
fig = df_customers.analysis.segment_polar(
    metrics=selected_metrics
    , dimension='activity_segment'
    , count_column='customer_unique_id'
    , labels=labels_for_polar
)
pb.to_slide(fig)

In [None]:
# Tabular view of the selected metrics per activity segment.
# NOTE(review): the value returned by segment_table() is not the last expression of
# the cell, so the table is only visible if the helper renders its own output — confirm.
df_customers.analysis.segment_table(
    metrics=selected_metrics
    , dimension='activity_segment'
    , count_column='customer_unique_id' 
)
# `fig` is the polar chart built in the previous cell; it is displayed here.
fig.show()

**Key Observations:**  

- 3% of all customers made no successful purchases  
- 94% of successful customers made only one purchase  
- 1% in Potential Core segment  
- 1% in Short-Lived Repeat segment  
- Core audience segment is less than 1%  
- Highest metric values in Core segment, followed by Potential Core  
- Median review score is higher for one-time purchasers  
- Core segment has best delivery time performance, One Time has worst  

We will not consider the segment of customers who did not make any successful purchases, as their values will be repetitive.

**By Purchase Amount Segment**

In [None]:
fig = df_customers.analysis.segment_polar(
    metrics=selected_metrics
    , dimension='value_segment'
    , exclude_segments=['Never Converted']
    , count_column='customer_unique_id'
    , labels=labels_for_polar
)
pb.to_slide(fig)

In [None]:
df_customers.analysis.segment_table(
    metrics=selected_metrics
    , dimension='value_segment'
    , exclude_segments=['Never Converted']
    , count_column='customer_unique_id' 
)
fig.show()

**Key Observations:**  

- 49% of customers are in medium payment tier  
- 24% in high payment tier, 24% in low  
- High payment tier spends most (expected)  
- No difference in median review scores across tiers  
- High payment tier has higher median order weight  

**By Purchase Frequency Segment**

In [None]:
fig = df_customers.analysis.segment_polar(
    metrics=selected_metrics
    , dimension='purchase_freq_segment'
    , exclude_segments=['Never Converted', 'Non-Repeating']
    , count_column='customer_unique_id'
    , labels=labels_for_polar
)
pb.to_slide(fig)

In [None]:
# Tabular view of the selected metrics per purchase-frequency segment.
# NOTE(review): the polar chart in the previous cell also excludes 'Non-Repeating',
# while this table excludes only 'Never Converted' — confirm the difference is intended.
df_customers.analysis.segment_table(
    metrics=selected_metrics
    , dimension='purchase_freq_segment'
    , exclude_segments=['Never Converted']
    , count_column='customer_unique_id' 
)
# `fig` is the polar chart built in the previous cell; it is displayed here.
fig.show()

**Key Observations:**  

- Among repeat buyers:  
  - Weekly purchasers: 1% (most common frequency)  
  - Quarterly/Semi-annual buyers show better metrics than other frequencies  

**By Time to Next Purchase Segment**

In [None]:
fig = df_customers.analysis.segment_polar(
    metrics=selected_metrics
    , dimension='repeat_segment'
    , exclude_segments=['Never Converted', 'Non-Repeating']
    , count_column='customer_unique_id'
    , labels=labels_for_polar
)
pb.to_slide(fig)

In [None]:
# Tabular view of the selected metrics per time-to-next-purchase segment.
# NOTE(review): the polar chart in the previous cell also excludes 'Non-Repeating',
# while this table excludes only 'Never Converted' — confirm the difference is intended.
df_customers.analysis.segment_table(
    metrics=selected_metrics
    , dimension='repeat_segment'
    , exclude_segments=['Never Converted']
    , count_column='customer_unique_id' 
)
# `fig` is the polar chart built in the previous cell; it is displayed here.
fig.show()

**Key Observations:**  

- Among repeat buyers, medium repurchase time segment is smallest (<1%)  
- Fast repurchase segment shows worse metrics than medium/slow segments  

**By Loyalty Segment**

In [None]:
fig = df_customers.analysis.segment_polar(
    metrics=selected_metrics
    , dimension='loyalty_segment'
    , exclude_segments=['Never Converted']
    , count_column='customer_unique_id'
    , labels=labels_for_polar
)
pb.to_slide(fig)

In [None]:
df_customers.analysis.segment_table(
    metrics=selected_metrics
    , dimension='loyalty_segment'
    , exclude_segments=['Never Converted']
    , count_column='customer_unique_id' 
)
fig.show()

**Key Observations:**  

- Loyalty segments:  
  - Promoters: 58%  
  - Critics: 13% (lowest)  
- Critics have:  
  - Higher total payment and AOV than promoters/neutrals  
  - Shortest time between first/last purchase (rarely return)  
  - Heavier average orders  
  - Worst median delivery time performance  

**By Risk Segment**

In [None]:
# Polar chart of the selected metrics per risk segment.
# Customers without a successful purchase are excluded so that this chart agrees with
# the risk-segment table in the next cell and with the other segment charts in this
# section, all of which drop the 'Never Converted' segment.
fig = df_customers.analysis.segment_polar(
    metrics=selected_metrics
    , dimension='risk_segment'
    , exclude_segments=['Never Converted']
    , count_column='customer_unique_id'
    , labels=labels_for_polar
)
# Register the chart for the slide export.
pb.to_slide(fig)

In [None]:
df_customers.analysis.segment_table(
    metrics=selected_metrics
    , dimension='risk_segment'
    , exclude_segments=['Never Converted']
    , count_column='customer_unique_id' 
)
fig.show()

**Key Observations:**  

- 99.5% of customers are "Reliable" (no order cancellations)  
- Cancellation segment has:  
  - Much shorter time between first/second purchase  
  - Higher median total spend and AOV  

**By Day of the Week Segment**

In [None]:
fig = df_customers.analysis.segment_polar(
    metrics=selected_metrics
    , dimension='weekday_segment'
    , exclude_segments=['Never Converted']
    , count_column='customer_unique_id'
    , labels=labels_for_polar
)
pb.to_slide(fig)

In [None]:
df_customers.analysis.segment_table(
    metrics=selected_metrics
    , dimension='weekday_segment'
    , exclude_segments=['Never Converted']
    , count_column='customer_unique_id' 
)
fig.show()

**Key Observations:**  

- 75% of customers only purchased on weekdays  
- Weekend purchasers have significantly longer time between first/last purchase  

**By Installment Payment Segment**

In [None]:
fig = df_customers.analysis.segment_polar(
    metrics=selected_metrics
    , dimension='installment_segment'
    , exclude_segments=['Never Converted']
    , count_column='customer_unique_id'
    , labels=labels_for_polar
)
pb.to_slide(fig)

In [None]:
df_customers.analysis.segment_table(
    metrics=selected_metrics
    , dimension='installment_segment'
    , exclude_segments=['Never Converted']
    , count_column='customer_unique_id' 
)
fig.show()

**Key Observations:**  

- 50% used installments at least once  
- 47% always paid in full  
- Installment users have significantly higher:  
  - Median total spend  
  - AOV  
  - Order weight  
  - Time between first/last purchase  

**By Average Number of Products per Order Segment**

In [None]:
fig = df_customers.analysis.segment_polar(
    metrics=selected_metrics
    , dimension='products_cnt_segment'
    , exclude_segments=['Never Converted']
    , count_column='customer_unique_id'
    , labels=labels_for_polar
)
pb.to_slide(fig)

In [None]:
df_customers.analysis.segment_table(
    metrics=selected_metrics
    , dimension='products_cnt_segment'
    , exclude_segments=['Never Converted']
    , count_column='customer_unique_id' 
)
fig.show()

**Key Observations:**  

- 88% of customers had ≤1 product per order  
- 8% averaged 1-2 products  
- Only 2% averaged >2 products  
- Customers with 2+ products per order have significantly higher:  
  - Median order weight  
  - Total spend  
  - AOV  
  - Time between first/last purchase  

**By Average Weight of Order Segment**

In [None]:
fig = df_customers.analysis.segment_polar(
    metrics=selected_metrics
    , dimension='weight_segment'
    , exclude_segments=['Never Converted']
    , count_column='customer_unique_id'
    , labels=labels_for_polar
)
pb.to_slide(fig)

In [None]:
df_customers.analysis.segment_table(
    metrics=selected_metrics
    , dimension='weight_segment'
    , exclude_segments=['Never Converted']
    , count_column='customer_unique_id' 
)
fig.show()

**Key Observations:**  

- Order weight segments:  
  - Light: 39%  
  - Medium: 37%  
  - Heavy: 21%  
- Heavy segment has significantly higher total spend and AOV  
- Light segment has shorter time between first/last purchase  

**By Top Days of the Week**

In [None]:
fig = df_customers.analysis.segment_polar(
    metrics=selected_metrics
    , dimension='customer_top_purchase_weekdays'
    , exclude_segments=['Never Converted']
    , max_segments=5
    , count_column='customer_unique_id'
    , labels=labels_for_polar
)
pb.to_slide(fig)

In [None]:
df_customers.analysis.segment_table(
    metrics=selected_metrics
    , dimension='customer_top_purchase_weekdays'
    , exclude_segments=['Never Converted']
    , max_segments=5
    , count_column='customer_unique_id' 
)
fig.show()

**Key Observations:**  

- Most customers only purchased on one weekday (expected due to low repeat purchases)  
- Top 3 purchase days: Monday, Tuesday, Wednesday  
- Monday-only buyers have longer time between first/last purchase than other top segments (possibly coincidental)  

**By Top Payment Types**

In [None]:
fig = df_customers.analysis.segment_polar(
    metrics=selected_metrics
    , dimension='customer_payment_types'
    , exclude_segments=['Never Converted']
    , max_segments=5
    , count_column='customer_unique_id'
    , labels=labels_for_polar
    , text_auto=True
)
pb.to_slide(fig)

In [None]:
df_customers.analysis.segment_table(
    metrics=selected_metrics
    , dimension='customer_payment_types'
    , exclude_segments=['Never Converted']
    , max_segments=5
    , count_column='customer_unique_id' 
)
fig.show()

**Key Observations:**  

- Payment methods:  
  - Credit card only: 73%  
  - Boleto only: 19%  
- Voucher-only segment has lower total spend and AOV than other top payment segments  


**By Top Product Categories**

In [None]:
fig = df_customers.analysis.segment_polar(
    metrics=selected_metrics
    , dimension='customer_top_product_categories'
    , exclude_segments=['Never Converted']
    , max_segments=5
    , count_column='customer_unique_id'
    , labels=labels_for_polar
)
pb.to_slide(fig)

In [None]:
df_customers.analysis.segment_table(
    metrics=selected_metrics
    , dimension='customer_top_product_categories'
    , max_segments=5
    , exclude_segments=['Never Converted']
    , count_column='customer_unique_id' 
)
fig.show()

**Key Observations:**  

- Most customers only bought from:  
  - Bed Bath Table  
  - Health Beauty categories  
- Sports goods buyers have longer time between first/last purchase than other category segments  


**By Top General Product Categories**

In [None]:
fig = df_customers.analysis.segment_polar(
    metrics=selected_metrics
    , dimension='customer_top_general_product_categories'
    , exclude_segments=['Never Converted']
    , max_segments=5
    , count_column='customer_unique_id'
    , labels=labels_for_polar
)
pb.to_slide(fig)

In [None]:
df_customers.analysis.segment_table(
    metrics=selected_metrics
    , dimension='customer_top_general_product_categories'
    , exclude_segments=['Never Converted']
    , max_segments=5
    , count_column='customer_unique_id' 
)
fig.show()

**Key Observations:**  

- Top generalized category segments:  
  - Electronics only: 26%  
  - Furniture only: 17%  
  - Home & Garden only: 14%  

**By Customer State**

In [None]:
fig = df_customers.analysis.segment_polar(
    metrics=selected_metrics
    , dimension='customer_state'
    , exclude_segments=['Never Converted']
    , max_segments=5
    , count_column='customer_unique_id'
    , labels=labels_for_polar
)
pb.to_slide(fig)

In [None]:
df_customers.analysis.segment_table(
    metrics=selected_metrics
    , dimension='customer_state'
    , max_segments=5
    , exclude_segments=['Never Converted']
    , count_column='customer_unique_id' 
)
fig.show()

**Key Observations:**  

- Customer distribution by state:  
  - São Paulo: 42%  
  - Rio de Janeiro: 13%  
  - Minas Gerais: 12%  

**By Customer City**

In [None]:
fig = df_customers.analysis.segment_polar(
    metrics=selected_metrics
    , dimension='customer_city'
    , exclude_segments=['Never Converted']
    , max_segments=5
    , count_column='customer_unique_id'
    , labels=labels_for_polar
)
pb.to_slide(fig)

In [None]:
df_customers.analysis.segment_table(
    metrics=selected_metrics
    , dimension='customer_city'
    , max_segments=5
    , exclude_segments=['Never Converted']
    , count_column='customer_unique_id' 
)
fig.show()

**Key Observations:**  

- Customer distribution by city:  
  - São Paulo: 16%  
  - Rio de Janeiro: 7%  

### 4.14.2 Customer Profiling

**By Purchase Frequency & Loyalty**

- **One-Time Buyers (94%):**
  - Single purchase only
  - Low engagement (short time between purchases)
  
- **Potential Core (1%):**
  - Potentially loyal but not yet core
  - Strong metrics (second only to Core)

- **Core (<1%):**
  - Loyalty core: highest spending, best metrics

- **Short-Lived Repeat (1%):**
  - Short-term loyalty 

**Recommendations:**

1. Convert One-Time to Potential Core:
   - Launch loyalty programs
   - Personalized offers based on first purchase
2. Retain Core customers:
   - Premium service tier
   - Exclusive early access to sales

---

**By Payment Amount**

- **High-Spend (24%):**
  - Large orders
  - Heavy items 
  - Critical risk

- **Medium-Spend (49%):**
  - Stable base 
  - Balanced metrics

- **Low-Spend (24%):**
  - Small orders
  - Likely trial purchases

**Recommendations:**
1. For High-Spend:
   - Improve delivery (current avg. 18 days)
   - Dedicated account managers
2. For Low-Spend:
   - Cross-sell bundles (+15% discount)
   - "Complete your set" prompts

---

**By Repurchase Timing**

- **Fast Repeat (<1%):**
  - Quick repurchase 
  - Low satisfaction 

- **Seasonal (1%):**
  - Quarterly/semi-annual purchases
  - High value 

**Recommendations:**
1. For Fast Repeat:
   - Post-purchase follow-ups
   - Satisfaction surveys
2. For Seasonal:
   - Pre-season reminders
   - "Back in stock" alerts

---

**By Loyalty**

- **Promoters (58%):**
  - High ratings (4-5 stars)
  - Low retention (94% one-time)

- **Critics (13%):**
  - High spenders 
  - Fast churn 

**Recommendations:**
1. For Promoters:
   - "Refer a friend" bonuses
   - Repeat purchase incentives
2. For Critics:
   - Logistics improvements
   - VIP complaint resolution

---

**Behavioral Patterns**

- Customers who made purchases not only on weekdays (25%):
  - More loyal (longer time between purchases).
- Customers who use installment payments (50%):
  - Higher order amounts, longer customer lifetime — "serious" customers.
- Customers with 2+ products in an order (2%):
  - Key for revenue (high metrics).
- Customers who use only a voucher:
  - Have lower total purchase amounts and average order amounts.


**Recommendations:**

- Installment campaigns:
  - "0% interest for 3 months"
- Multi-item incentives:
  - "Free shipping on 3+ items"
- Voucher users:
  - Upsell to credit card payments

---

**By Geography**

- **São Paulo (42%):**
  - Electronics/Furniture focus
  - 18% faster delivery than average

- **Rio de Janeiro (13%):**
  - High Fashion/Beauty demand
  - 22% installment adoption

**Recommendations:**

1. Localized campaigns:
   - "SP Furniture Week" discounts
   - "Rio Beauty Box" bundles
2. Warehouse optimization:
   - Strategic stock placement
   - Regional delivery hubs

### 4.14.3 Pairwise Segment Combinations

Examine the distribution of customers across combinations of 2 segments. 

We will exclude the non-converted segment from the analysis.

In [None]:
pb.configure(
    df = df_customers[df_customers.buys_cnt.notna()]
    , metric = 'customer_unique_id'
    , metric_label = 'Share of Customers'
    , agg_func = 'nunique'
    , norm_by='all'
    , axis_sort_order='descending'    
    , text_auto='.1%'
)

**loyalty_segment and value_segment**

In [None]:
pb.cat_compare(
    cat1='loyalty_segment'
    , cat2 = 'value_segment'
    , visible_graphs = [2, 3]
)

**Key Observations:**

- The medium payment tier dominates across all loyalty segments.
- Promoters are the majority in all payment tiers.
- Critics stand out noticeably in the high payment tier segment.

**purchase_freq_segment and value_segment**

In [None]:
pb.cat_compare(
    cat1='purchase_freq_segment'
    , cat2 = 'value_segment'
    , visible_graphs = [2, 3]
)

**Key Observations:**

- The low Value segment has a significantly higher proportion of non-repeat purchasers (logical since they don't make repeat purchases).
- In the high Value segment, the weekly purchase frequency segment underperforms - meaning fewer purchases occurred weekly.

**activity_segment and repeat_segment**

In [None]:
pb.cat_compare(
    cat1='activity_segment'
    , cat2 = 'repeat_segment'
    , visible_graphs = [2, 3]
)

**Key Observations:**  

- The slow repeat segment is clearly highlighted in potential core, meaning they have a long time between repeat purchases. The same pattern is present in the core segment, but it is less pronounced.


**loyalty_segment and risk_segment**

In [None]:
pb.cat_compare(
    cat1='loyalty_segment'
    , cat2 = 'risk_segment'
    , visible_graphs = [2, 3]
)

**Key Observations:**

- The potential core cohort clearly highlights the slow repeat segment, indicating a longer time period before repeat purchases. In the core cohort, the same pattern is observed, but it is less pronounced.

**customer_top_general_product_categories and value_segment**

In [None]:

pb.cat_compare(
    cat1='customer_top_general_product_categories'
    , cat2 = 'value_segment'
    , trim_top_n_cat1=5
    , visible_graphs = [2, 3]
)

**Key Observations:**

- Electronics dominate purchases in the low Value segment, while medium Value segments show noticeably fewer electronics purchases.

**weight_segment and customer_state**

In [None]:
pb.cat_compare(
    cat1='weight_segment'
    , cat2 = 'customer_state' 
    , trim_top_n_cat2=5
    , visible_graphs = [2, 3]
)

**Key Observations:**

- São Paulo has more light-weight orders while Rio de Janeiro has more heavy-weight orders.


**weekday_segment and activity_segment**

In [None]:
pb.cat_compare(
    cat1='weekday_segment'
    , cat2 = 'activity_segment' 
    , visible_graphs = [2, 3]
)

**Key Observations:**

- Core and potential core segments contain more customers who shop beyond just weekdays, while one-time purchasers predominantly shop on weekdays.


**products_cnt_segment and loyalty_segment**

In [None]:
pb.cat_compare(
    cat1='products_cnt_segment'
    , cat2 = 'loyalty_segment' 
    , visible_graphs = [2, 3]
)

**Key Observations:**

- Single-product orders dominate among promoters, while critics tend to have more 2+ product orders.

**installment_segment and repeat_segment**

In [None]:
pb.cat_compare(
    cat1='installment_segment'
    , cat2 = 'repeat_segment' 
    , visible_graphs = [2, 3]
)

**Key Observations:**

- The installment segment contains more customers with longer periods between repeat purchases.
- The non-installment segment shows:
  - Lower proportion of long repeat purchase cycles
  - Dominance of one-time purchasers

<h2 id="4-15"> 4.15 RFM Analysis</h2>

In [None]:
# Run the RFM analysis on the sales data and keep the whole result object: the
# following cells read its 'hist', 'heat', 'heat_pairs', 'heat_sliced', 'seg_tree',
# 'seg_bar' and 'df_rfm' entries.
rfm_res = df_sales.analysis.rfm(
    user_id_col='customer_unique_id'
    , order_id_col='order_id'
    , date_col='order_purchase_dt'
    , revenue_col='total_payment'
    # NOTE(review): presumably trims values above the 99th percentile — confirm
    # against the helper's documentation.
    , upper_quantile=0.99
    , return_rfm=True
)

Let’s examine the distributions of Recency, Frequency, and Monetary.

In [None]:
rfm_res['hist']

Let’s look at the RFM heatmap where color represents Monetary.

In [None]:
fig = rfm_res['heat']
pb.to_slide(fig)
fig.show()

**Key Observations:**

- The FR segment 33 generates the highest payments - frequent recent buyers.

Let’s examine the distribution of customers by RFM pairwise combinations.

In [None]:
fig = rfm_res['heat_pairs']
pb.to_slide(fig)
fig.show()

**Key Observations:**

- The FM segment 33 clearly stands out in terms of customer count - frequent high-value buyers.


Let’s examine slices of the FM pair by R.

In [None]:
fig = rfm_res['heat_sliced']
pb.to_slide(fig)
fig.show()

**Key Observations:**

- The FM33 segment contains the most R=3 customers - frequent recent high-value buyers.


Let’s look at the distribution by segments.

In [None]:
fig_tree = rfm_res['seg_tree']
fig_bar = rfm_res['seg_bar']
pb.to_slide(fig_tree, '_treemap')
pb.to_slide(fig_bar, '_bar')
fig_tree.show()
fig_bar.show()

**Key Observations:**

- Largest segments: Hibernating (22%), About to Sleep (18%), Promising (14%) - mostly inactive customers.
- Champions: 4%, Loyal: 7%, Lost: 4%.

Save the RFM dataframe for clustering.

In [None]:
df_rfm = rfm_res['df_rfm']
del rfm_res

<h2 id="4-16"> 4.16 Customer Clustering</h2>

### 4.16.1 All Customers

#### 4.16.1.1 Cluster Definition

- For clustering, we will use RFM metrics and the average number of unique products per order.
- RFM metrics are well-suited for clustering. Adding the average number of unique products will help account for assortment demand diversity.

In [None]:
selected_metrics = [
    'avg_unique_products_cnt'
]

Create a dataframe with the selected metrics.

We will only consider customers who have made at least one successful purchase.

In [None]:
mask = df_customers.buys_cnt.notna()

In [None]:
# Build the clustering input: one row per customer selected by `mask`, carrying the
# chosen customer metric(s) plus the RFM columns. The RFM score and segment columns
# are removed after the join, and the customer id becomes the index.
cols_to_drop = ['recency_score', 'frequency_score', 'monetary_score', 'rfm_score', 'rfm_segment']
customer_features = df_customers.loc[mask, ['customer_unique_id', *selected_metrics]]
df_processed = (
    customer_features
    .merge(df_rfm, how='left', on='customer_unique_id')
    .drop(columns=cols_to_drop)
    .set_index('customer_unique_id')
)

Check for highly correlated features.

In [None]:
corr_matrix = df_processed.corr().abs()

In [None]:
# Keep only the entries strictly above the diagonal, so each feature pair is inspected
# once and self-correlations are ignored; all other cells become NaN.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper_triangle = corr_matrix.where(upper_mask)

In [None]:
# Columns having at least one absolute correlation above 0.6 in the upper triangle
# (NaN cells compare as False and are therefore ignored).
cols_to_drop = upper_triangle.columns[(upper_triangle > 0.6).any(axis=0)].tolist()
cols_to_drop

No need to delete anything. 

Look for missing values in the columns.

In [None]:
df_processed.isna().sum().nlargest(5)

There are very few missing values, and they relate to customers who made only one purchase and have not yet received their item. 

We will remove these rows before standardization.

In [None]:
# Remove rows with missing values first, then fit the scaler on the remaining rows
# and standardize them.
df_processed = df_processed.dropna()
scaler = StandardScaler().fit(df_processed)
X_scaled = scaler.transform(df_processed)

Determine the optimal number of clusters using the elbow method; the silhouette score of the chosen solution is checked afterwards.

In [None]:
# Elbow method: fit K-Means for k = 1..14 and record the inertia (within-cluster sum
# of squares) of every fit.
cluster_counts = range(1, 15)
wcss = []
for i in cluster_counts:
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42).fit(X_scaled)
    wcss.append(kmeans.inertia_)

# Plot WCSS against the number of clusters; the figure is the cell output.
px.line(
    x=cluster_counts,
    y=wcss,
    labels={'x': 'Number of clusters', 'y': 'WCSS'},
    title='The Elbow Method',
    width=600,
    height=400,
)

**Key Observations:**

- The elbow method shows a clear break at 5 clusters. We'll use 5 clusters.

In [None]:
# Fit the final K-Means model with the chosen number of clusters and take the
# zero-based cluster index assigned to every row of X_scaled.
optimal_clusters = 5
kmeans = KMeans(n_clusters=optimal_clusters, init='k-means++', random_state=42)
cluster_labels = kmeans.fit(X_scaled).labels_

Examine the clustering quality.

In [None]:
score = silhouette_score(X_scaled, cluster_labels)
print(f'Silhouette Score: {score:.3f}')

**Key Observations:**

- Silhouette score of 0.492 indicates good cluster separation.

Add cluster labels to the dataframe.

In [None]:
df_processed['cluster'] = cluster_labels + 1

#### 4.16.1.2 Cluster Analysis

Analyze the resulting clusters.

In [None]:
df_processed = df_processed.reset_index()

**Distribution by Clustering Metrics**

In [None]:
selected_metrics = [
    'avg_unique_products_cnt',
    'recency', 
    'frequency',
    'monetary',
]

Provide more readable names for the metrics on the graphs.

In [None]:
metric_labels = {
    'avg_unique_products_cnt': 'Avg Unique Products',
    'recency': 'Recency', 
    'frequency': 'Frequency', 
    'monetary': 'Monetary', 
}
cluster_label = {'cluster': 'Cluster'}

In [None]:
labels_for_polar={**cluster_label, **base_labels, **metric_labels}

In [None]:
fig = df_processed.analysis.segment_polar(
    metrics=selected_metrics
    , dimension='cluster'
    , count_column='customer_unique_id'
    , labels=labels_for_polar
)
pb.to_slide(fig, 'cluster all')
fig.show()

In [None]:
# Tabular view of the clustering metrics per cluster.
df_processed.analysis.segment_table(
    metrics=selected_metrics
    , dimension='cluster'
    , count_column='customer_unique_id' 
)
# NOTE(review): `fig` is the polar chart from the previous cell, which already calls
# fig.show(); this second call displays the same chart again — confirm it is wanted.
fig.show()

**Key Observations:**

- Most customers fall in Cluster 1 (53%) and Cluster 2 (39%).
- Cluster 1: No standout metrics
- Cluster 2: Highest Recency
- Cluster 3: Highest Monetary, Cluster 4: Highest Avg Unique Products, Cluster 5: Highest Frequency

---

**Number of Customers by Clusters in Different Segments**

Add dimensions to the dataframe with clusters.

In [None]:
# Attach the customer segment dimensions to the clustered customers; the left join
# keeps every row of df_processed.
segment_columns = df_customers[['customer_unique_id', *customers_dim]]
df_processed = df_processed.merge(segment_columns, how='left', on='customer_unique_id')

In [None]:
df_processed.viz.update_plotly_settings(
    labels={**base_labels, 'cluster': 'Cluster'}
)

In [None]:
# Set the `pb` defaults for the cluster-by-segment comparisons below: the metric is
# the number of unique customers, normalized with norm_by='all' and labelled as a share.
pb.configure(
    df = df_processed
    , metric = 'customer_unique_id'
    , metric_label = 'Share of Customers'
    , agg_func = 'nunique'
    , norm_by='all'
    , axis_sort_order='descending'
    , text_auto='.1%'
    # Cluster labels are 1-based (cluster_labels + 1), so derive the display order
    # from the fitted number of clusters instead of hard-coding 1..5.
    , plotly_kwargs= {'category_orders': {'cluster': list(range(1, optimal_clusters + 1))}}
)

**By Activity Segment**

In [None]:
# Cluster vs. activity segment.
pb.cat_compare(
    cat1='cluster',
    cat2='activity_segment',
    visible_graphs=[2, 3],
)

**Key Observations:**

- Clusters 1-4 are dominated by one-time purchasers.
- Cluster 5 dominates all active segments except one-time purchases.
- One-time purchase rates are similar across all clusters except Cluster 5.

**By Purchase Amount Segment**

In [None]:
# Cluster vs. purchase amount (value) segment.
pb.cat_compare(
    cat1='cluster',
    cat2='value_segment',
    visible_graphs=[2, 3],
)

**Key Observations:**  

- Clusters 1-2: Mostly medium payment tier
- Cluster 3: Entirely high-value segment
- Clusters 1-2 are less common in high-value segment
- Clusters 1-2 dominate within each value segment

**By Loyalty Segment**

In [None]:
# Cluster vs. loyalty segment.
pb.cat_compare(
    cat1='cluster',
    cat2='loyalty_segment',
    visible_graphs=[2, 3],
)

**Key Observations:**

- Cluster 4 has notably more critics
- Clusters 1-2 dominate across loyalty segments
- Fewer promoters in Cluster 4
- More neutrals in Cluster 5

**By Risk Segment**

In [None]:
# Cluster vs. risk segment.
pb.cat_compare(
    cat1='cluster',
    cat2='risk_segment',
    visible_graphs=[2, 3],
)

**Key Observations:**

- Clusters 4-5 are more common among risky customers (with order cancellations)

**By Installment Segment**

In [None]:
# Cluster vs. installment segment.
pb.cat_compare(
    cat1='cluster',
    cat2='installment_segment',
    visible_graphs=[2, 3],
)

**Key Observations:**

- Installments don't dominate Clusters 1-2
- Installments clearly dominate Clusters 3-5
- Highest installment rate in Cluster 3

**By Average Number of Products per Order Segment**

In [None]:
# Cluster vs. average-products-per-order segment.
pb.cat_compare(
    cat1='cluster',
    cat2='products_cnt_segment',
    visible_graphs=[2, 3],
)

**Key Observations:**

- Cluster 4 consists entirely of customers averaging >1 product per order

**By Average Order Weight Segment**

In [None]:
# Cluster vs. average order weight segment.
pb.cat_compare(
    cat1='cluster',
    cat2='weight_segment',
    visible_graphs=[2, 3],
)

**Key Observations:**

- Cluster 3 has more heavy-weight orders
- Cluster 1 dominates light-weight orders

**By Top Payment Types**

In [None]:
# Cluster vs. payment types, with cat2 trimmed to its top 5 values.
pb.cat_compare(
    cat1='cluster',
    cat2='customer_payment_types',
    trim_top_n_cat2=5,
    visible_graphs=[2, 3],
)

**Key Observations:**

- Cluster 1 has more debit card users

**By Top Product Categories**

In [None]:
# Cluster vs. top product categories, with cat2 trimmed to its top 5 values.
pb.cat_compare(
    cat1='cluster',
    cat2='customer_top_product_categories',
    trim_top_n_cat2=5,
    visible_graphs=[2, 3],
)

**Key Observations:**

- Cluster 4 dominates Bed Bath Table category

**By Top Generalized Product Categories**

In [None]:
# Cluster vs. generalized product categories, with cat2 trimmed to its top 5 values.
pb.cat_compare(
    cat1='cluster',
    cat2='customer_top_general_product_categories',
    trim_top_n_cat2=5,
    visible_graphs=[2, 3],
)

**Key Observations:**

- Cluster 3: Dominated by electronics
- Cluster 4: Dominated by furniture

**By Customer City**

In [None]:
# Cluster vs. customer city, with cat2 trimmed to its top 5 values.
pb.cat_compare(
    cat1='cluster',
    cat2='customer_city',
    trim_top_n_cat2=5,
    visible_graphs=[2, 3],
)

**Key Observations:**

- Cluster 2 is more common in Rio de Janeiro
- Cluster 1 is less common in Rio de Janeiro

### 4.16.2 Customers with Multiple Purchases

#### 4.16.2.1 Cluster Definition

We will conduct a separate clustering of customers who have made more than one successful purchase. 

We will use the same metrics as in the clustering of all customers.

In [None]:
# Customer-level metric taken from df_customers; the RFM metrics come from the merge with df_rfm below.
selected_metrics = ['avg_unique_products_cnt']

In [None]:
# Customers with at least two purchases.
mask = df_customers['buys_cnt'].ge(2)

In [None]:
# Build the clustering frame: selected customer metrics joined with RFM values,
# without the RFM score/segment columns, indexed by customer_unique_id.
# NOTE(review): the inner join keeps only customers whose rfm_segment is
# 'Champions', which narrows the sample beyond `buys_cnt >= 2`; the surrounding
# text does not mention this filter — confirm it is intended.
cols_to_drop = [
    'recency_score',
    'frequency_score',
    'monetary_score',
    'rfm_score',
    'rfm_segment',
]
df_processed = (
    df_customers
    .loc[mask, ['customer_unique_id', *selected_metrics]]
    .merge(
        df_rfm.loc[lambda rfm: rfm['rfm_segment'] == 'Champions'],
        on='customer_unique_id',
        how='inner',
    )
    .drop(columns=cols_to_drop)
    .set_index('customer_unique_id')
)

Check for missing values.

In [None]:
# Count missing values per column and show the five columns with the most.
df_processed.isna().sum().nlargest(5)

There are no missing values.

In [None]:
# Drop incomplete rows (a safeguard), then fit the scaler and transform the features.
df_processed = df_processed.dropna()
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_processed)

Determine the optimal number of clusters using the silhouette method.

In [None]:
# Silhouette score of a KMeans partition for every candidate cluster count (2..14),
# plotted as a line to pick the number of clusters.
candidate_ks = range(2, 15)
silhouette_scores = []
for n_clusters in candidate_ks:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(X_scaled)
    silhouette_avg = silhouette_score(X_scaled, cluster_labels)
    silhouette_scores += [silhouette_avg]
px.line(
    x=candidate_ks,
    y=silhouette_scores,
    labels={'x': 'Number of clusters', 'y': 'Silhouette Score'},
    title='Silhouette Score Method',
    width=600,
    height=400,
)

**Key Observations:**

- Global peak at 7 clusters
- Sharp drop after k=7 suggests overfitting

Examine the dendrogram of hierarchical clustering.

In [None]:
# Ward linkage computed on the standardized features.
linked = linkage(X_scaled, method='ward')
# ff.create_dendrogram expects the matrix of observations and derives the
# linkage itself from pairwise distances. Passing the linkage matrix as the
# data would cluster the rows of that matrix instead of the customers, so the
# observations are passed and the precomputed Ward linkage is supplied
# through `linkagefun`.
fig = ff.create_dendrogram(
    X_scaled,
    orientation='bottom',
    linkagefun=lambda _distances: linked,
)
fig.update_layout(
    title='Dendrogram of hierarchical clustering',
    xaxis_title='Observations',
    yaxis_title='Distance',
    width=800,
    height=500,
    margin=dict(l=50, r=50, b=50, t=50),
    yaxis_mirror=False,
)
# One tick label per observation would be unreadable, so they are hidden.
fig.update_xaxes(showticklabels=False, ticks='', mirror=False)
# Rendered as a static PNG without the mode bar.
fig.show(config=dict(displayModeBar=False), renderer="png")

**Key Observations:**

- Optimal cut at 6-7 clusters (where branches lengthen)
- We'll choose 7 clusters

In [None]:
# Number of clusters chosen from the silhouette curve and the dendrogram above.
optimal_clusters = 7
# NOTE(review): no linkage is passed, so the estimator's default is used
# (presumably Ward, matching the dendrogram) — confirm for the installed version.
model = AgglomerativeClustering(n_clusters=optimal_clusters)
cluster_labels = model.fit_predict(X_scaled)

Evaluate quality.

In [None]:
# Silhouette score of the final agglomerative partition, printed to three decimals.
score = silhouette_score(X_scaled, cluster_labels)
print(f'Silhouette Score: {score:.3f}')

**Key Observations:**

- Silhouette score of 0.41 indicates good clustering
- Clusters remain distinguishable despite score <0.5

Add cluster labels to the dataframe.

In [None]:
# Store the labels shifted by one (presumably so clusters are numbered from 1 rather than 0).
df_processed['cluster'] = cluster_labels + 1

#### 4.16.2.2 Cluster Analysis

Analyze the resulting clusters.

In [None]:
# Move customer_unique_id from the index back into a regular column (it is used as a column below).
df_processed = df_processed.reset_index()

**Distribution by Clustering Metrics**

In [None]:
# Metrics compared across clusters in the polar chart and the summary table.
selected_metrics = ['avg_unique_products_cnt', 'recency', 'frequency', 'monetary']

Provide more readable names for the metrics on the graphs.

In [None]:
# Display names for the clustering metrics and for the cluster column.
metric_labels = dict(
    avg_unique_products_cnt='Avg Unique Products',
    recency='Recency',
    frequency='Frequency',
    monetary='Monetary',
)
cluster_label = dict(cluster='Cluster')

In [None]:
# Merge the label mappings; on duplicate keys the later mapping wins
# (metric_labels over base_labels over cluster_label).
labels_for_polar={**cluster_label, **base_labels, **metric_labels}

In [None]:
# Polar chart of the selected metrics by cluster; the figure is also handed to pb.to_slide.
fig = df_processed.analysis.segment_polar(
    metrics=selected_metrics,
    dimension='cluster',
    count_column='customer_unique_id',
    labels=labels_for_polar,
)
pb.to_slide(fig, 'cluster repeat')
fig.show()

In [None]:
# Per-cluster summary table for the selected metrics.
# The call is kept as the final expression of the cell so that its result is
# what the notebook renders. The former trailing `fig.show()` only re-drew the
# polar chart created in the previous cell.
df_processed.analysis.segment_table(
    metrics=selected_metrics,
    dimension='cluster',
    count_column='customer_unique_id',
)

**Key Observations:**

- Most customers in Cluster 4 (43%) and Cluster 3 (34%)
- Cluster 1: No standout metrics
- Cluster 2: Highest Monetary
- Cluster 6: High Frequency and Monetary
- Cluster 7: High avg unique products

---

**Number of Customers by Clusters in Different Segments**

Add dimensions to the dataframe with clusters.

In [None]:
# Left-join the customer dimensions onto the clustered customers by customer_unique_id.
df_processed = df_processed.merge(
    df_customers[['customer_unique_id', *customers_dim]],
    on='customer_unique_id',
    how='left',
)

In [None]:
# Register plot labels: the base labels plus a title for the cluster column.
df_processed.viz.update_plotly_settings(
    labels=dict(base_labels, cluster='Cluster'),
)

In [None]:
# Shared settings for the cluster-vs-segment comparisons below: unique customers per group.
# The cluster order is pinned explicitly to 1..optimal_clusters, mirroring the
# all-customers configuration earlier in the notebook, so clusters are listed
# in numeric order for this clustering as well.
pb.configure(
    df=df_processed,
    metric='customer_unique_id',
    metric_label='Share of Customers',
    agg_func='nunique',
    norm_by='all',
    axis_sort_order='descending',
    text_auto='.1%',
    plotly_kwargs={'category_orders': {'cluster': list(range(1, optimal_clusters + 1))}},
)

**By Activity Segment**

In [None]:
# Cluster vs. activity segment.
pb.cat_compare(
    cat1='cluster',
    cat2='activity_segment',
    visible_graphs=[2, 3],
)

**Key Observations:**

- Cluster 6: Entirely core audience
- Core also dominates Cluster 1
- Cluster 4 dominates potential core segment

**By Purchase Amount Segment**

In [None]:
# Cluster vs. purchase amount (value) segment.
pb.cat_compare(
    cat1='cluster',
    cat2='value_segment',
    visible_graphs=[2, 3],
)

**Key Observations:**

- Clusters 2 and 6: Entirely high-value segment
- Cluster 4 dominates medium-value segment

**By Loyalty Segment**

In [None]:
# Cluster vs. loyalty segment.
pb.cat_compare(
    cat1='cluster',
    cat2='loyalty_segment',
    visible_graphs=[2, 3],
)

**Key Observations:**

- Cluster 6: Entirely promoters
- Cluster 4 dominates critics
- Clusters 4-5 more common among critics

**By Installment Segment**

In [None]:
# Cluster vs. installment segment.
pb.cat_compare(
    cat1='cluster',
    cat2='installment_segment',
    visible_graphs=[2, 3],
)

**Key Observations:**

- Cluster 6: Entirely installment users
- Cluster 7 has more full-payment users

**By Average Number of Products per Order Segment**

In [None]:
# Cluster vs. average-products-per-order segment.
pb.cat_compare(
    cat1='cluster',
    cat2='products_cnt_segment',
    visible_graphs=[2, 3],
)

**Key Observations:**

- Cluster 6: Entirely single-product orders
- Cluster 7: No single-product orders
- Clusters 3-4 dominate single-product orders
- Clusters 2 and 7 dominate bulk orders (>2 products)
- Cluster 5 has more multi-product orders

**By Average Order Weight Segment**

In [None]:
# Cluster vs. average order weight segment.
pb.cat_compare(
    cat1='cluster',
    cat2='weight_segment',
    visible_graphs=[2, 3],
)

**Key Observations:**

- Cluster 6: Entirely light-weight orders
- Cluster 2: Heavy-weight orders dominate

**By Top Payment Types**

In [None]:
# Cluster vs. payment types, with cat2 trimmed to its top 5 values.
pb.cat_compare(
    cat1='cluster',
    cat2='customer_payment_types',
    trim_top_n_cat2=5,
    visible_graphs=[2, 3],
)

**Key Observations:**

- Cluster 6: Entirely credit card users
- Cluster 7 has more boleto users

**By Top Product Categories**

In [None]:
# Cluster vs. top product categories, with cat2 trimmed to its top 5 values.
pb.cat_compare(
    cat1='cluster',
    cat2='customer_top_product_categories',
    trim_top_n_cat2=5,
    visible_graphs=[2, 3],
)

**Key Observations:**

- Clusters 1-2 dominate Watches Gifts category
- Cluster 7 dominates Bed Bath Table

**By Top Generalized Product Categories**

In [None]:
# Cluster vs. generalized product categories, with cat2 trimmed to its top 5 values.
pb.cat_compare(
    cat1='cluster',
    cat2='customer_top_general_product_categories',
    trim_top_n_cat2=5,
    visible_graphs=[2, 3],
)

**Key Observations:**

- Cluster 2 strongly dominates electronics
- Cluster 7 dominates furniture and home/garden

**By Customer City**

In [None]:
# Cluster vs. customer city, with cat2 trimmed to its top 5 values.
pb.cat_compare(
    cat1='cluster',
    cat2='customer_city',
    trim_top_n_cat2=5,
    visible_graphs=[2, 3],
)

**Key Observations:**

- Cluster 6: Entirely São Paulo customers (top 5 cities)
- Cluster 1 more common in Niterói
- Cluster 3 more common in Campinas

<h2 id="4-17"> 4.17 Hypothesis Testing</h2>

As we previously determined, the following metrics have a right-skewed distribution:

- from_purchase_to_approved_hours
- total_payment
- delivery_time_days
- total_weight_kg
- avg_products_price
- total_freight_value

Given the skewness of these metrics and the fact that most of our hypotheses will involve more than two categories, we will use regression analysis to test our hypotheses. 

Regression analysis utilizes the full variance of the data when building a model.

To account for the right-skewness, we will employ a Generalized Linear Model (GLM) regression with a Gamma distribution and a log-link function.

For multiple comparisons, we will apply the Holm correction.

### 4.17.1 Time Patterns

**Are orders processed longer at night?**

For each time of day N (except Night):

- $H_{0}^{N}:$ The mean order processing time at night equals the mean order processing time during time period N.
- $H_{1}^{N}:$ The mean order processing time at night does not equal the mean order processing time during time period N.

In [None]:
# Gamma GLM (log link): processing time vs. time of day, with "Night" as the reference level
# and Holm-adjusted p-values.
# NOTE(review): the 1e-6 shift presumably keeps zero durations strictly positive for the Gamma family — confirm.
(
    df_sales
    .assign(from_purchase_to_approved_hours=lambda d: d['from_purchase_to_approved_hours'] + 1e-6)
    .stats.glm(
        formula='from_purchase_to_approved_hours ~ C(purchase_time_of_day, Treatment(reference="Night"))',
        family=sm.families.Gamma(link=sm.families.links.Log()),
        p_adjust='holm',
    )
)

**Result:**

- At the 0.05 significance level, the mean order processing time at night is statistically significantly different from the mean order processing time at any other time of day.
- Moreover, the processing time at night is statistically significantly longer than during other periods.

---

**Are orders processed longer on weekdays?**

- $H_{0}:$ The mean order processing time on weekdays equals the mean order processing time on weekends.
- $H_{1}:$ The mean order processing time on weekdays does not equal the mean order processing time on weekends.

In [None]:
# Gamma GLM (log link): processing time vs. day type, with "Weekday" as the reference level.
# NOTE(review): the 1e-6 shift presumably keeps zero durations strictly positive for the Gamma family — confirm.
(
    df_sales
    .assign(from_purchase_to_approved_hours=lambda d: d['from_purchase_to_approved_hours'] + 1e-6)
    .stats.glm(
        formula='from_purchase_to_approved_hours ~ C(purchase_day_type, Treatment(reference="Weekday"))',
        family=sm.families.Gamma(link=sm.families.links.Log()),
    )
)

**Result:**

- At the 0.05 significance level, the mean order processing time on weekdays is statistically significantly different from the mean order processing time on weekends.
- Moreover, the processing time on weekdays is statistically significantly shorter than on weekends.

### 4.17.2 Customer Reviews Scores

**Do orders with a review score of 1 have a higher average order value?**

For each rating N (except 1):

- $H_{0}^{N}:$ The mean order value for rating 1 equals the mean order value for rating N.
- $H_{1}^{N}:$ The mean order value for rating 1 does not equal the mean order value for rating N.

In [None]:
# Gamma GLM (log link): order value vs. review score, with score 1 as the reference level
# and Holm-adjusted p-values.
df_sales.stats.glm(
    formula='total_payment ~ C(order_avg_reviews_score, Treatment(reference=1))',
    family=sm.families.Gamma(link=sm.families.links.Log()),
    p_adjust='holm',
)

**Result:**

- At the 0.05 significance level, the mean order value for rating 1 is statistically significantly different from the mean order value for any other rating.
- Moreover, the mean order value for rating 1 is statistically significantly higher than for other ratings.

---

**Do orders with a review score of 1 take longer to deliver?**

For each rating N (except 1):

- $H_{0}^{N}:$ The mean delivery time for rating 1 equals the mean delivery time for rating N.
- $H_{1}^{N}:$ The mean delivery time for rating 1 does not equal the mean delivery time for rating N.

In [None]:
# Gamma GLM (log link): delivery time vs. review score, with score 1 as the reference level
# and Holm-adjusted p-values.
df_sales.stats.glm(
    formula='delivery_time_days ~ C(order_avg_reviews_score, Treatment(reference=1))',
    family=sm.families.Gamma(link=sm.families.links.Log()),
    p_adjust='holm',
)

**Result:**

- At the 0.05 significance level, the mean delivery time for rating 1 is statistically significantly different from the mean delivery time for any other rating.
- Moreover, the mean delivery time for rating 1 is statistically significantly longer than for other ratings.

### 4.17.3 Installments

**Are installment orders processed faster?**

- $H_{0}:$ The mean processing time of installment-based orders equals the mean processing time of non-installment orders.
- $H_{1}:$ The mean processing time of installment-based orders does not equal the mean processing time of non-installment orders.

In [None]:
# Gamma GLM (log link): processing time vs. installment flag, with "Has Installments" as the reference level.
# NOTE(review): the 1e-6 shift presumably keeps zero durations strictly positive for the Gamma family — confirm.
(
    df_sales
    .assign(from_purchase_to_approved_hours=lambda d: d['from_purchase_to_approved_hours'] + 1e-6)
    .stats.glm(
        formula='from_purchase_to_approved_hours ~ C(order_has_installment, Treatment(reference="Has Installments"))',
        family=sm.families.Gamma(link=sm.families.links.Log()),
    )
)

**Result:**

- At the 0.05 significance level, the mean order processing time for installment payments is statistically significantly different from the mean order processing time for non-installment payments.
- Moreover, the mean order processing time for installment payments is statistically significantly shorter than for non-installment payments.

---

**Do installment orders have a higher average order value?**

- $H_{0}:$ The mean order value of installment-based orders equals the mean order value of non-installment orders.
- $H_{1}:$ The mean order value of installment-based orders does not equal the mean order value of non-installment orders.

In [None]:
# Gamma GLM (log link): order value vs. installment flag, with "Has Installments" as the reference level.
df_sales.stats.glm(
    formula='total_payment ~ C(order_has_installment, Treatment(reference="Has Installments"))',
    family=sm.families.Gamma(link=sm.families.links.Log()),
)

**Result:**

- At the 0.05 significance level, the mean order value for installment payments is statistically significantly different from the mean order value for non-installment payments.
- Moreover, the mean order value for installment payments is statistically significantly higher than for non-installment payments.

---

**Do installment orders have a higher average order weight?**

- $H_{0}:$ The mean weight of installment-based orders equals the mean weight of non-installment orders.
- $H_{1}:$ The mean weight of installment-based orders does not equal the mean weight of non-installment orders.

In [None]:
# Gamma GLM (log link): order weight vs. installment flag, with "Has Installments" as the reference level.
# NOTE(review): the 1e-6 shift presumably keeps zero weights strictly positive for the Gamma family — confirm.
(
    df_sales
    .assign(total_weight_kg=lambda d: d['total_weight_kg'] + 1e-6)
    .stats.glm(
        formula='total_weight_kg ~ C(order_has_installment, Treatment(reference="Has Installments"))',
        family=sm.families.Gamma(link=sm.families.links.Log()),
    )
)

**Result:**

- At the 0.05 significance level, the mean order weight for installment payments is statistically significantly different from the mean order weight for non-installment payments.
- Moreover, the mean order weight for installment payments is statistically significantly higher than for non-installment payments.

---

**Do installment orders have a higher average product price in the order?**

- $H_{0}:$ The mean product price in orders with installment payments equals the mean product price in orders without installment payments.
- $H_{1}:$ The mean product price in orders with installment payments does not equal the mean product price in orders without installment payments.

In [None]:
# Gamma GLM (log link): average product price vs. installment flag, with "Has Installments" as the reference level.
df_sales.stats.glm(
    formula='avg_products_price ~ C(order_has_installment, Treatment(reference="Has Installments"))',
    family=sm.families.Gamma(link=sm.families.links.Log()),
)

**Result:**

- At the 0.05 significance level, the mean product price in orders with installment payments is statistically significantly different from the mean product price in orders without installment payments.
- Moreover, the mean product price in orders with installment payments is statistically significantly higher than in orders without installment payments.

---

**Do installment orders have a higher average delivery cost?**

- $H_{0}:$ The mean delivery cost for orders with installment payments equals the mean delivery cost for orders without installment payments.
- $H_{1}:$ The mean delivery cost for orders with installment payments does not equal the mean delivery cost for orders without installment payments.

In [None]:
# Gamma GLM (log link): freight value vs. installment flag, with "Has Installments" as the reference level.
# NOTE(review): the 1e-6 shift presumably keeps zero freight values strictly positive for the Gamma family — confirm.
(
    df_sales
    .assign(total_freight_value=lambda d: d['total_freight_value'] + 1e-6)
    .stats.glm(
        formula='total_freight_value ~ C(order_has_installment, Treatment(reference="Has Installments"))',
        family=sm.families.Gamma(link=sm.families.links.Log()),
    )
)

**Result:**

- At the 0.05 significance level, the mean delivery cost for orders with installment payments is statistically significantly different from the mean delivery cost for orders without installment payments.
- Moreover, the mean delivery cost for orders with installment payments is statistically significantly higher than for orders without installment payments.

### 4.17.4 Order Processing and Delivery

**Do delayed orders have a higher average order value?**

- $H_{0}:$ The mean value of delayed orders equals the mean value of non-delayed orders.
- $H_{1}:$ The mean value of delayed orders does not equal the mean value of non-delayed orders.

In [None]:
# Gamma GLM (log link): order value vs. delay flag, with "Delayed" as the reference level.
df_sales.stats.glm(
    formula='total_payment ~ C(is_delayed, Treatment(reference="Delayed"))',
    family=sm.families.Gamma(link=sm.families.links.Log()),
)

**Result:**

- At the 0.05 significance level, the mean value of delayed orders is statistically significantly different from the mean value of non-delayed orders.
- Moreover, the mean value of delayed orders is statistically significantly higher than of non-delayed orders.

---

**Do delayed orders have a higher average order weight?**

- $H_{0}:$ The mean weight of delayed orders equals the mean weight of non-delayed orders.
- $H_{1}:$ The mean weight of delayed orders does not equal the mean weight of non-delayed orders.

In [None]:
# Gamma GLM (log link): order weight vs. delay flag, with "Delayed" as the reference level.
# NOTE(review): the 1e-6 shift presumably keeps zero weights strictly positive for the Gamma family — confirm.
(
    df_sales
    .assign(total_weight_kg=lambda d: d['total_weight_kg'] + 1e-6)
    .stats.glm(
        formula='total_weight_kg ~ C(is_delayed, Treatment(reference="Delayed"))',
        family=sm.families.Gamma(link=sm.families.links.Log()),
    )
)

**Result:**

- At the 0.05 significance level, the mean weight of delayed orders is statistically significantly different from the mean weight of non-delayed orders.
- Moreover, the mean weight of delayed orders is statistically significantly higher than that of non-delayed orders.

---

**Do delayed orders have a higher average product price in the order?**

- $H_{0}:$ The mean product price in delayed orders equals the mean product price in non-delayed orders.
- $H_{1}:$ The mean product price in delayed orders does not equal the mean product price in non-delayed orders.

In [None]:
# Gamma GLM (log link): average product price vs. delay flag, with "Delayed" as the reference level.
df_sales.stats.glm(
    formula='avg_products_price ~ C(is_delayed, Treatment(reference="Delayed"))',
    family=sm.families.Gamma(link=sm.families.links.Log()),
)

**Result:**

- At the 0.05 significance level, the mean product price in delayed orders is statistically significantly different from the mean product price in non-delayed orders.
- Moreover, the mean product price in delayed orders is statistically significantly higher than in non-delayed orders.

---

**Do delayed orders have a higher average delivery cost?**

- $H_{0}:$ The mean delivery cost for delayed orders equals the mean delivery cost for non-delayed orders.
- $H_{1}:$ The mean delivery cost for delayed orders does not equal the mean delivery cost for non-delayed orders.

In [None]:
# Gamma GLM (log link): freight value vs. delay flag, with "Delayed" as the reference level.
# NOTE(review): the 1e-6 shift presumably keeps zero freight values strictly positive for the Gamma family — confirm.
(
    df_sales
    .assign(total_freight_value=lambda d: d['total_freight_value'] + 1e-6)
    .stats.glm(
        formula='total_freight_value ~ C(is_delayed, Treatment(reference="Delayed"))',
        family=sm.families.Gamma(link=sm.families.links.Log()),
    )
)

**Result:**

- At the 0.05 significance level, the mean delivery cost in delayed orders is statistically significantly different from the mean delivery cost in non-delayed orders.
- Moreover, the mean delivery cost in delayed orders is statistically significantly higher than in non-delayed orders.

---

**Does delivery delay affect the distribution of order ratings?**

- $H_{0}:$ The distribution of order ratings is identical for delayed and non-delayed orders.
- $H_{1}:$ The distribution of order ratings differs between delayed and non-delayed orders.

We will use:

- OrderedModel, since the rating is an ordinal variable.

In [None]:
# Ordered model of the review score on the delay flag ("Delayed" is the reference level).
# The score is first cast to an ordered categorical with levels 1 < 2 < 3 < 4 < 5.
# NOTE(review): values outside these five levels would become missing in the cast — confirm scores are whole numbers.
(
    df_sales
    .assign(
        order_avg_reviews_score=lambda d: pd.Categorical(
            d['order_avg_reviews_score'],
            categories=list(range(1, 6)),
            ordered=True,
        ),
    )
    .stats.ordered_model(
        formula='order_avg_reviews_score ~ C(is_delayed, Treatment(reference="Delayed"))',
    )
)

**Result:**

- At the 0.05 significance level, the distribution of order ratings differs between delayed and non-delayed orders.
- Moreover, non-delayed orders have statistically significantly higher chances of receiving a higher rating.

---

**Are expensive orders delivered longer?**

For each order price category N (except expensive):

- $H_{0}^{N}:$ The mean delivery time for the expensive price category equals the mean delivery time for category N.
- $H_{1}^{N}:$ The mean delivery time for the expensive price category does not equal the mean delivery time for category N.

In [None]:
# Gamma GLM (log link): delivery time vs. order price category, with "Expensive" as the reference level
# and Holm-adjusted p-values.
df_sales.stats.glm(
    formula='delivery_time_days ~ C(order_total_payment_cat, Treatment(reference="Expensive"))',
    family=sm.families.Gamma(link=sm.families.links.Log()),
    p_adjust='holm',
)

**Result:**

- At the 0.05 significance level, the mean delivery time for the high-price order category is statistically significantly different from the mean delivery time for any other category.
- Moreover, the mean delivery time for the high-price category is statistically significantly longer than for other categories.

<h1 id="5"> 5 General Conclusion</h1>

<h2 id="5-1"> 5.1 Time Dynamics</h2>

**Order and Sales Comparison**

- Conversion to successful sales shows consistent month-over-month growth.

---

**Canceled Order Rate**

- Monthly canceled order rate fluctuates between 0.2% and 1.2%.
- Orders with rating 1 have significantly higher cancellation rates. Ratings 4 and 5 show 0% cancellation for most months.

---

**Sales Volume**

- November 24, 2017 (Black Friday) was an anomalous peak sales day.
- Monthly sales grew until 2018, then stabilized at 6-7 thousand orders per month.
- Significant sales drops (>5% MoM) occurred in April, June, and December 2017, and February and June 2018.
- Rating 5 orders consistently dominate monthly volumes.
- Rating 2 orders are consistently the least common.
- Rating 5 orders continue growing in 2018 despite overall stagnation, correlated with declining rating 1 orders.
- Delayed orders don't grow proportionally with total orders.
- Peak delayed order months: November 2017 (Black Friday) and March 2018.
- November 2017 saw a sharp increase in long-delivery orders (Black Friday effect), persisting until March 2018 before declining.
- By August 2018, fast/medium delivery orders outnumbered long-delivery ones.
- Installment orders initially led but equalized with non-installment by 2018.
- São Paulo state maintains dominant sales volume without 2018 declines seen elsewhere.
- Rio de Janeiro and Minas Gerais typically rank second and third by sales volume.
- São Paulo city leads sales volume, with Rio de Janeiro second.
- São Paulo city shows 2018 monthly sales growth unlike other cities.

---

**Sales Value**

- Black Friday 2017 was the peak sales value day.
- Monthly sales value grew until 2018, stabilizing at 1-1.2 million R$.
- Significant value drops (>5% MoM) occurred in April, June, and December 2017, and February and June 2018.
- Rating 5 orders dominate total sales value.
- Rating 2 orders show lowest sales value.
- Black Friday's value spike was most pronounced for ratings 5 and 1.
- Rating 5 order values grow in 2018 despite overall stagnation, partly due to rating 1 value decline.
- Peak delayed order value months: November 2017 and March 2018.
- Long-delivery order values spiked November 2017-April 2018, beyond Black Friday explanation.
- Installment orders consistently show higher total value.
- São Paulo state leads sales value, followed by Rio de Janeiro and Minas Gerais.
- São Paulo city leads sales value, with Rio de Janeiro second.
- Credit card payments dominate sales value, with boleto second.
- Debit card payment values grew from June 2018.
- Electronics lead category values, followed by furniture.
- 'Beauty/Health' and 'Home/Garden' categories show 2018 growth while others stagnate/decline.

---

**Average Order Value (AOV)**

- Daily AOV fluctuates between 100-250 R$.
- No Black Friday AOV spike.
- Monthly AOV fluctuates between 150-170 R$ without growth.
- Significant AOV drops (>5% MoM) in May, July, and November 2017, and August 2018.
- June 2017 saw a major delayed order AOV spike, with smaller peaks in April/December 2017 and January/June 2018.
- Premium products typically have longer delivery times.
- Installment orders consistently show higher AOV (expected due to affordability).
- Installment AOV declined until July 2017, then grew with fluctuations.
- Rating 1 orders typically have higher AOV.
- Rating 2 orders often show higher AOV than ratings 3-5, suggesting expensive orders receive lower ratings.

---

**ARPPU**

- ARPPU fluctuates daily between 100 and 250 R$.
- There is no noticeable spike in ARPPU on Black Friday.
- ARPPU does not grow over the months but fluctuates between 130 and 150 R$.
- For most months, orders with a review rating of 1 have a higher ARPPU.
- Additionally, orders with a review rating of 2 have a higher ARPPU than those with ratings 3, 4, and 5 in the majority of months.
- In June 2017, there was a strong peak in ARPPU for delayed orders. Smaller peaks were observed in April and December 2017, January, and June 2018.

---

**Number of Customers**

- Black Friday 2017 was the peak customer count day.
- Customer growth continued until 2018, then stabilized at 6-7 thousand monthly.
- Significant customer drops (>5% MoM) occurred in April, June, and December 2017, and February and June 2018.
- Nighttime typically has fewer customers than other periods.
- Evening shows highest customer activity.
- São Paulo state leads customer count without 2018 declines seen elsewhere.
- Rio de Janeiro and Minas Gerais typically rank second and third in customer count.
- São Paulo city leads customer count, with Rio de Janeiro second.
- São Paulo city shows 2018 monthly customer growth unlike other cities.

---

**New Customer Rate**

- Daily new customer rate fluctuates significantly but never drops below 0.92.
- Monthly rate consistently declines but remains above 97% (nearly all active customers are new).
- Weekend rates fluctuate more than weekdays, typically lower.
- No significant state-level differences in new customer rates.
- Rio Grande do Sul shows wider fluctuations.
- São Paulo and Rio de Janeiro cities show more stable new customer rates than other top cities.

---

**Number of Sellers**

- Black Friday 2017 was the peak seller count day.
- Monthly seller count shows consistent growth.
- Weekday seller growth outpaces weekends.
- São Paulo state leads seller count with strongest growth.
- Paraná and Minas Gerais typically rank second and third.
- São Paulo city leads seller count, with Curitiba second.

---

**New Seller Rate**

- New seller rate declined until June 2017, then stabilized at 0.1-0.2.
- Weekend new seller rates are consistently lower than weekdays.

---

**Seller-to-Customer Ratio**

- Customer growth outpaced seller growth until November 2017, then reversed.
- New customer growth led until July 2017, then paralleled seller growth.

---

**Orders per Customer**

- Average orders per customer remains stable near 1 throughout.

---

**Number of Reviews**

- Daily review counts fluctuate dramatically, suggesting batch processing.
- Reviews grew until 2018, then stabilized at 6-8 thousand monthly.
- Fewer reviews created on Mondays and Sundays.
- Rating 5 reviews dominate monthly volumes.
- Rating 2 reviews are least common.
- Rating 1 reviews outnumber ratings 2 and 3.

---

**Review Ratings**

- Daily average ratings fluctuate between 3-5.
- Monthly averages declined August 2017-March 2018, then spiked.
- Weekday ratings are consistently higher than weekends.

---

**NPS**

- NPS remains satisfactory (0-49) throughout - many neutrals, few critics.
- Significant NPS drop occurred March 2018.

---

**Shipping Cost Ratio**

- Shipping cost as order value percentage remains stable at 0.2-0.22.
- Light products show higher ratios than medium/heavy.
- Installment orders show lower ratios throughout.
- São Paulo state shows lowest ratios among top states.
- São Paulo city shows lowest ratios among top cities.

---

**Delivery Time**

- Daily average delivery fluctuates between 5-20 days.
- Delivery times grew August 2017-February 2018, then dropped sharply to ~8 days.
- Lower ratings typically correlate with longer deliveries.
- São Paulo state shows fastest deliveries among top states.
- Rio de Janeiro and Rio Grande do Sul show longest delays.
- Rio de Janeiro city showed strongest delivery time growth October 2017-February 2018.

---

**Delivery Delay**

- Orders consistently deliver faster than estimates.
- Early deliveries improved until March 2017 (~12 days early), then fluctuated.
- June 2018 saw ~20 day early deliveries.

---

**Carrier Handover Delay**

- Orders consistently reach carriers faster than limits.
- Early handovers peaked at ~4.5 days in May 2017, declining to ~2.5 days by August.

---

**Delivery Stage Breakdown**

- Carrier delivery stage consumes most time throughout.

---

**Order Weight**

- Average weight declined from 2.8kg to 1.9kg.
- Rating 1 orders are typically heavier.
- Rating 2 orders show most monthly weight variation.
- Delayed orders are typically heavier.
- Installment orders are consistently heavier.
- Lighter orders typically deliver faster.

---

**Products per Order**

- Monthly average remains stable at 1.12-1.16.
- Ratings 1-2 typically have more products.
- Ratings 4-5 typically have fewest products.
- Delayed orders show stronger fluctuation.
- Installment orders typically have more products.

---

**Product Price**

- No Black Friday price spikes.
- Monthly averages fluctuate between 115-135 R$.
- Shows clear seasonality with alternating growth/decline periods.
- Rating 1 products are typically most expensive.
- Rating 3 products are typically least expensive.
- Delayed orders typically have higher prices.
- June 2017 saw delayed order price spike.
- Installment products are significantly more expensive.

---

**Sellers per Order**

- Average grows insignificantly from 1 to 1.02.
- Ratings 1-2 typically involve more sellers.
- Ratings 4-5 involve fewest sellers.
- April 2018 saw spike in seller count for undelivered orders.
- Installment orders involve more sellers.

---

**Categories per Order**

- Monthly average remains stable.
- Ratings 1-2 typically involve more categories.

<h2 id="5-2"> 5.2 Customers</h2>

### 5.2.1 Typical Customer Profile

**Daily Activity**:
- Typical daily customers: 100-215
- 5% of days had ≤45 customers; 5% had ≥291 customers

**Purchase Behavior**:
- 97% made only 1 successful purchase
- Only 3% made >1 purchase
- 0.24% made ≥3 purchases
- 0.05% made ≥4 purchases
- 0.02% made ≥5 purchases
- 0.01% made ≥6 purchases
- 30% made repeat purchases within 24h (likely multiple orders in one session)

**Financial Metrics**:
- 75% have lifetime spend <185 R$
- 5% have lifetime spend ≥470 R$
- 75% have average shipping cost ≤24 R$
- 5% have average shipping cost ≥54 R$

**Order Characteristics**:
- 87% average 1 product per order
- 1% average ≥3 products per order
- 75% have average order weight ≤2kg
- 5% have average order weight ≥10kg

**Delivery Experience**:
- 75% experience average delivery time ≤16 days
- 5% experience ≥29 days delivery
- Median delivery acceleration: 6-16 days
- 5% experience ≥4 days delivery delay

**Customer Retention**:
- Median time between purchases: ≤125 days (75% of customers)
- Maximum consecutive months purchasing: 6 months (1 customer)
- Customers with consecutive month purchases:
    - 3 months: 8 customers
    - 2 months: 438 customers

### 5.2.2 RFM

**Top Segments**:
- High frequency + recent (fr33): Highest spenders
- High frequency + monetary (fm33): Most customers with r3 (frequent, recent, high-value)

**Loyalty Distribution**:
- Hibernating: 22%
- About to Sleep: 18% 
- Promising: 14%
- Champions: 4%
- Loyal: 7%
- Lost: 4%

### 5.2.3 Customer Segmentation

**By Activity Segment**

- 3% (of all customers) did not make a single successful purchase.
- 94% of successful customers made only one purchase.
- 1% each in the Potential Core Audience and Short-Lived Repeat segments.
- The Core Audience accounts for less than 1%.
- The highest metric values are in the Core segment. Potential Core ranks second.
- The median average rating is higher among those who made only one purchase.
- Delivery time outperformance is highest in the Core segment and lowest in the One-Time segment.

---

**By Purchase Amount Segment**

- 49% of customers fall into the medium purchase amount segment. 24% each in the high and low segments.
- The high purchase amount segment spends the most. This is logical.
- The high purchase amount segment has a higher median value for average order weight.

---

**By Purchase Frequency Segment**

- Among those who made repeat purchases, the most common frequency is once a week (1%).
- Segments with quarterly or semi-annual purchases have higher metric values than other segments.

---

**By Time to Repeat Purchase Segment**

- In terms of time to repeat purchase, among those who made repeat purchases, the medium segment is the smallest (<1%).
- The segment with quick repeat purchases has worse metric performance than segments with medium or long times to the second purchase.

---

**By Loyalty Segment**

- Promoters dominate the loyalty segment (58%). Critics are the smallest group (13%).
- Critics (review score below 3) have higher total payment amounts and average order values than promoters and neutrals.
- Critics have a very low time between first and last purchase, meaning they rarely return.
- Critics, on average, purchase heavier orders.
- Critics also have the worst median delivery time outperformance, meaning their deliveries take longer.

---

**By Risk Segment**

- 99.5% of customers are in the reliable category, meaning they did not cancel orders.
- The segment that canceled at least one order has a much lower time between first and second purchases compared to the segment without cancellations.
- However, the segment with order cancellations has higher median values for total purchase amounts and average order values.

---

**By Day of Week Segment**

- 75% of customers made purchases only on weekdays.
- The segment that made purchases not only on weekends has a significantly higher time between first and last purchases.

---

**By Installment Segment**

- 50% of customers used installments at least once. 47% always paid in full upfront.
- The installment segment has significantly higher median values for total purchase amounts, average order values, order weight, and time between first and last purchases.

---

**By Average Number of Products per Order Segment**

- 88% of customers had no more than one product per order.
- 8% had an average of 1 to 2 products.
- Only 2% of customers had an average of more than 2 products per order.
- The segment with 2 or more products per order has significantly higher values for average order weight, total purchase amounts, and average order values. They also have a longer time between first and last purchases.

---

**By Average Order Weight Segment**

- 39% of customers purchased lightweight products on average. 37% - medium. 21% - heavy.
- The segment with heavy orders has significantly higher total purchase amounts and average order values.
- The lightweight segment has a shorter time between first and last purchases compared to the medium and heavy segments.

---

**By Top Days of Week**

- Most customers made purchases on only one day of the week. This is logical, given the low number of repeat purchases.
- The top 3 days by customer count: Monday, Tuesday, and Wednesday.
- Interestingly, customers who made purchases only on Monday have a longer time between first and last purchases than others in the top 5 segments. But this is likely coincidental.

---

**By Top Payment Types**

- The most common payment method was credit card only (73%). Boleto ranks second (19%).
- The segment that used only vouchers has lower total purchase amounts and average order values than other segments in the top 5 payment types.

---

**By Top Product Categories**

- The largest number of customers purchased only from the Bed Bath Table and Health Beauty categories.
- Customers who purchased only sports goods have a longer time between first and last purchases than other segments by product category.

---

**By Top Generalized Product Categories**

- Among the top 5 segments by generalized product category, the largest share of customers purchased only from 
  - Electronics (26%)
  - Furniture (17%)
  - Home & Garden (14%).

---

**By Customer State**

- 42% of customers are from São Paulo. 
- 13% - Rio de Janeiro. 
- 12% - Minas Gerais.

---

**By Customer City**

- 16% of customers are from São Paulo. 
- 7% - Rio de Janeiro.

---

### 5.2.4 Customer Profiling

**By Purchase Frequency and Loyalty**

- One-Time Buyers (94%):
    - Make only one purchase.
    - Low engagement (short time between purchases).
- Potential Core (1%):
    - Potentially loyal but not yet part of the core.
    - High metrics (second only to Core).
- Core (<1%):
    - Core audience: most loyal, high spending, best metrics.
- Short-Lived Repeat (1%):
    - Short-term loyalty.

Recommendations:
- Focus on converting One-Time buyers into Potential Core (loyalty programs, personalized offers).
- Retain Core through premium service.

---

**By Purchase Amount**

- High-Spend (24%):
    - Large order values, heavy orders.
    - Despite high spending, ratings are not above median (risk of losing critics).
- Medium-Spend (49%):
    - The majority, "stable" customers.
- Low-Spend (24%):
    - Small order values, possibly trial purchases.

Recommendations:

- For High-Spend: Improve delivery (they have low time outperformance) and service to reduce critic share.
- For Low-Spend: Encourage higher order values (discounts on additional items).

---

**By Time Between Purchases**

- Quick Repeat Purchases (<1%):
    - Low metrics (possibly impulse purchases or dissatisfaction).
- Quarterly/Semi-Annual (1%):
    - High metrics—"planned" purchases (e.g., seasonal items).

Recommendations:

- For the quick repeat segment: Improve post-purchase experience (to prevent loss after the first purchase).
- For "quarterly" buyers: Target reminders before seasons.

---

**By Loyalty**

- Promoters (58%):
    - Satisfied but few repeat purchases (94% are one-time).
    - Paradox: high ratings but low retention.
- Critics (13%):
    - High order values but quick churn (short time between purchases).
    - Delivery issues (low delivery time outperformance).

Recommendations:

- Analyze reasons for low promoter retention (e.g., lack of motivation for repeat purchases).
- Address critics: Improve logistics and service (they are potentially valuable due to high order values).

---

**By Behavioral Patterns**

- Customers who made purchases not only on weekends (25%):
    - More loyal (longer time between purchases).
- Customers using installments (50%):
    - High order values, long lifecycle—"serious" customers.
- Customers with 2+ items per order (2%):
    - Key for profitability (high metrics).
- Customers who use only vouchers:
    - Have lower total purchase amounts and average order values.

Recommendations:

- Promote installments to increase average order value.
- Encourage multi-item orders (free shipping for N items).
- For voucher-paying customers, develop strategies to increase average order value.

---

**By Geography**

- São Paulo (42%) and Rio de Janeiro (13%):
    - Key markets.

Recommendations:

- Localized marketing campaigns (e.g., Furniture promotions in São Paulo).

### 5.2.5 Customer Clustering

#### 5.2.5.1 All Customers

**Cluster Distribution:**

- Cluster 1 (53%): Neutral characteristics (baseline group)
- Cluster 2 (39%): High Recency (recent purchases)
- Cluster 3 (2.5%): High Monetary Value
- Cluster 4 (3%): High number of unique products per order
- Cluster 5 (3%): High purchase frequency (Frequency)

---

**Key Features:**

- Cluster 1        
    - One-time purchases.
    - Medium purchase amount segment.
    - Dominates the value segment.
    - Dominates the loyalty segment.
    - Dominates lightweight orders.
    - Dominates the debit card segment.
- Cluster 2
    - One-time purchases.
    - Medium purchase amount segment.
    - Dominates the value segment.
    - Dominates the loyalty segment.
    - Dominates the state of Rio de Janeiro.
- Cluster 3
    - One-time purchases.
    - Entirely consists of the high-value segment.
    - Highest share of installment payments among all clusters.
    - Noticeably more heavy-weight orders than other clusters.
    - Dominates the generalized Electronics category.
- Cluster 4
    - One-time purchases.
    - Predominantly critics.
    - Lower share of promoters compared to other clusters.
    - Dominates the high-risk segment (customers who canceled at least one order).
    - Dominates installment payments.
    - Entirely consists of customers with an average of more than one product per order.
    - Dominates the Bed Bath Table product category.
    - Dominates the generalized Furniture category.
- Cluster 5
    - Predominantly repeat buyers.
    - Higher share of neutral loyalty segment than other clusters.
    - Dominates the high-risk segment (customers who canceled at least one order).
    - Dominates installment payments.

#### 5.2.5.2 Customers with Multiple Purchases

**Cluster Distribution:**

- Cluster 4 (43%): Stable buyers
- Cluster 3 (34%): Average across all metrics
- Cluster 5 (9%): High share of critics
- Cluster 1 (8%): High Recency (recent purchases)
- Cluster 2 (4%): High Monetary Value
- Cluster 7 (3%): Diverse product buyers
- Cluster 6 (0.1%): Core Audience (Frequency + Monetary)

---

**Key Features:**

- Cluster 1
    - Dominates the core audience.
    - Dominates the "watches gifts" category.
- Cluster 2
    - Entirely consists of the high-value segment (purchase amount).
    - Dominates the "bulk" segment (average of more than 2 products per order).
    - Dominates heavy-weight orders.
    - Dominates the "watches gifts" category.
    - Dominates the generalized Electronics category.
- Cluster 3
    - Dominates orders with a single product.
- Cluster 4
    - Dominates the Potential Core segment.
    - Dominates the medium-value segment.
    - Dominates critics.
    - Dominates orders with a single product.
- Cluster 5
    - Dominates critics.
    - Higher share of "multi" segment (1-2 products per order).
- Cluster 6
    - Entirely consists of the core audience.
    - Entirely consists of the high-value segment (purchase amount).
    - Entirely consists of promoters.
    - Entirely consists of installment users.
    - Entirely consists of orders with a single product.
    - Entirely consists of lightweight orders.
    - Entirely consists of credit card payments.
    - Entirely consists of customers from São Paulo (top 5 cities by sales volume).
- Cluster 7
    - Higher average number of unique products.
    - Higher share of full upfront payments compared to other clusters.
    - No single-product orders.
    - Dominates the "bulk" segment (average of more than 2 products per order).
    - Dominates "boleto" payment method.
    - Dominates the Bed Bath Table category.
    - Dominates the generalized Furniture and Home & Garden categories.

<h2 id="5-3"> 5.3 Sales</h2>

**Sales Volume and Revenue**

- On 75% of days, there were up to 215 orders. On 5% of days, there were 45 or fewer orders. On 5% of days, there were 293 or more orders. There were also several days with over 400 orders.
- On 75% of days, sales amounted to up to 33K R$. On 5% of days, sales were 6.7K R$ or less. On 5% of days, sales were 49K R$ or more. There were also several days with sales exceeding 70K R$.
- Most sales occurred in the evening (36% of all sales), while the fewest occurred at night (9%). Morning accounted for 23%, and afternoon for 32%.
- The fewest sales occurred on Saturday (11%), while the most occurred on Monday (16%).
- 77% of all orders were placed on weekdays.
- 59% of orders received a review rating of 5.
- The fewest orders received a rating of 2 (3%).
- Orders with a rating of 1 were more common than those with ratings of 2 and 3.
- 92% of orders were not delayed.
- 63% of orders fell into the medium price category.
- 46% of orders were in the medium weight category, while 40% were lightweight.
- 59% of orders had a medium delivery time category.
- 51% of orders used installment payments.
- 75% of orders were paid with a credit card, while 20% used "boleto."
- Top 3 product categories by order volume: Bed Bath Table (9%), Health Beauty (9%), and Sports Leisure (8%).
- Top 3 generalized categories by order volume: Electronics (27%), Furniture (18%), and Home & Garden (14%).
- 42% of sales were in São Paulo. 13% and 12% were in Rio de Janeiro and Minas Gerais, respectively. Other states accounted for 6% or less.
- The city of São Paulo accounted for 16% of all sales, while Rio de Janeiro accounted for 7%. Other cities accounted for 3% or less.
- The total value of installment orders was significantly higher than non-installment orders, despite similar order volumes. This means installment orders are more expensive, which is logical—installments allow customers to afford higher-value purchases.

---

**Order Value**

- 75% of orders had a value of up to 177 R$. 5% had a value of up to 33 R$. 5% had a value of 445 R$ or more. There were also many outliers exceeding 1000 R$.
- Non-delayed orders had lower average values than delayed orders.
- Installment orders had much higher average values—customers using installments can afford more expensive purchases.
- Although São Paulo had the most orders, it had the lowest average order value among top states.
- The highest average order value was in the state of Pará.
- The city of São Paulo had the most orders but not the highest order value. Rio de Janeiro ranked second in order volume but had high order values.
- Among top cities by sales volume, Salvador had the highest average order value.
- Orders with a rating of 1 had higher average values than others. Rating 2 ranked second. Conclusion: expensive orders received more 1 and 2 ratings.

---

**Order Ratings**

- 59% of orders received a rating of 5.
- Peaks in high and low ratings were observed at night, especially on Thursdays.
- Non-delayed orders had significantly higher average ratings than delayed orders.
- Non-delayed orders had a higher share of 5 ratings, while delayed orders had more 1 ratings.
- Faster deliveries correlated with higher ratings.
- The lowest average rating was in the state of Maranhão.
- Among top states by sales volume, Rio de Janeiro and Bahia had the highest share of 1 ratings.
- Among top cities by sales volume, Rio de Janeiro and Porto Alegre had noticeably higher shares of 1 ratings.

---

**Order Weight**

- 75% of orders weighed up to 2 kg. 5% weighed up to 150 grams. 5% weighed 10 kg or more.
- Delayed orders were heavier than non-delayed orders.
- Installment orders were heavier than non-installment orders.
- Among top states by sales volume, Mato Grosso had the heaviest average orders.
- Among top cities by sales volume, Santos and Rio de Janeiro had the heaviest orders. Rio de Janeiro ranked second in order volume.
- Orders with a rating of 1 were significantly heavier than others. Rating 2 ranked second. Conclusion: heavier orders received lower ratings.

---

**Number of Products per Order**

- 90% of orders consisted of a single product.
- There were 2 anomalous orders with 20 and 21 products.
- Non-delayed orders had slightly more products per order than delayed orders.
- Orders with ratings of 1 and 2 had higher average product counts than others.

---

**Product Price per Order**

- For 75% of orders, the average product price was up to 140 R$. 5% had an average product price of 363 R$ or higher.
- Delayed orders had higher average product prices.
- Installment orders had significantly higher average product prices.
- Among top states by sales volume, Pará had the highest average product price, while São Paulo had the lowest (despite having the most orders).
- Among top cities by sales volume, the top 3 by product price were Brasília, Rio de Janeiro, and Salvador.

---

**Number of Sellers per Order**

- 99% of orders had only one seller.

---

**Number of Categories per Order**

- 99% of orders had only one category.

<h2 id="5-4"> 5.4 Products</h2>

**Product Quantity**

- 75% of products sold 1-2 units over the entire period. 5% sold 10+ units.
- On 75% of days, up to 207 products were sold. There were several days with over 400 products sold.
- Most products sold were in categories: Bed Bath Table and Health Beauty.
- Fewest products sold were in categories: Security and Services.
- Top 3 generalized categories by products sold: Electronics (27%), Furniture (19%), Home & Garden (15%). Fewest (1%) in Food & Drinks.

---

**Product Price**

- 75% of sold products had an average price ≤153 R$. 5% were ≤17 R$, while another 5% were ≥470 R$.
- On 5% of days, average product price was ≤94 R$. 50% of days ranged 108-130 R$. 5% were ≥162 R$.
- Highest average price in Watches Gifts category. Lowest in Flowers.
- Top 3 generalized categories by average price: Industry & Construction, Electronics, Fashion. Lowest in Food & Drinks.

---

**Product Sales Revenue**

- 75% of days had product sales ≤29K R$. 5% had ≥42K R$.
- Highest revenue categories: Health Beauty and Watches Gifts.
- Lowest revenue in Security and Services.
- Top 3 generalized categories by revenue: Electronics, Furniture, Home & Garden. Lowest in Food & Drinks.

---

**Revenue per Product**

- 75% of products generated ≤325 R$ total revenue.
- Highest average revenue per product in Watches Gifts. Lowest in Flowers.
- Top 3 generalized categories by revenue per product: Electronics, Beauty & Health, Industry & Construction. Lowest in Books & Stationery.

---

**Price Changes**  

- 80% of products had no price changes. 5% changed by ≥20 R$.
- Most volatile prices in Watches Gifts.
- Top 3 generalized categories by price volatility: Electronics, Industry & Construction, Beauty & Health. Least volatile: Food & Drinks.

---

**Average Quantity per Order**

- 85% of products appeared as single units in orders.
- Highest average quantities in Food & Drinks.

---

**Product Characteristics**  

- 75% of products have a title length of up to 57 characters.
- 75% of products have a description length of up to 1000 characters.
- 52% of all products have only one photograph. 5% of products have 6 or more photographs.
- 75% of products weigh up to 1.9 kg. 5% have a weight of 11 kg or more.
- 75% of products have a length of up to 38 cm. 5% have a length of 65 cm or more.
- 75% of products have a width of up to 30 cm. 5% have a width of 47 cm or more.
- 75% of products have a height of up to 21 cm. 5% have a height of 44 cm or more.
- 75% of products have a volume of up to 19,000 cm³. 5% have a volume of more than 64,000 cm³.
- 75% of products have a weight-to-volume ratio of up to 0.2. 5% of products have a ratio of 0.5 or more.


---

**Unsold Products**

- 2.5% of all products were never sold.

<h2 id="5-5"> 5.5 Reviews</h2>

**Number of Reviews**

- Most common: 1 review per day.
- 75% of days had ≤270 reviews; 5% had ≥375.
- Fewest reviews on Mondays. Sundays had more than Mondays but still below other days (possibly a review registration peculiarity).
- Rating distribution:
    - 5 stars: 58%
    - 4 stars: 19%
    - 1 star: 12%
    - 3 stars: 8%
    - 2 stars: 3% (least common)

---

**Review Ratings**

- 5% of days had average rating <3.26; 5% >4.6. 50% ranged 3.9-4.3.
- Weekdays had slightly higher ratings than weekends.
- 5-star reviews were more frequent on weekdays; 1-star reviews more frequent on weekends.
- Sundays had the lowest average ratings.
- Sundays had disproportionately more 1-star and fewer 5-star ratings.

---

**Response Time**

- Bimodal distribution: peaks at ~1 day and ~3.5 days.
- 75% of reviews received responses within ≤3.1 days; 5% took ≥7 days.
- 5% of days had average response time ≥5.85 days.
- Slowest responses on Fridays; fastest on Mondays.

---

**Review Length**

- 75% of reviews had ≤100 characters.
- Lower ratings correlated with longer messages (negative reviews tend to be more detailed).

---

**NPS**

- Just over 5% of days had good NPS (>50). 5% had negative NPS (customer dissatisfaction).

---

**Review Titles**

- Most frequent words: "recommend", "excellent"
- ~10% of titles had negative sentiment.

---

**Review Text**

- Frequent mentions of delivery issues.
- Most common word: "product".
- ~15% of messages had negative sentiment.

<h2 id="5-6"> 5.6 Delivery</h2>

**Shipping Costs**  

- 75% of orders had shipping ≤24 R$; 5% ≥54.7 R$.
- Several extreme outliers with very high shipping costs.
- Delayed orders had higher shipping costs.
- Heavier orders cost more to ship (expected).
- Installment orders had higher shipping costs.
- Among top states:
    - Lowest average shipping: São Paulo
    - Highest: Maranhão
- Among top cities (highest average shipping): Salvador, Porto Alegre, Brasília

---

**Seller-Buyer Distance**

- 75% of orders: ≤800 km. 5% ≤16.5 km; 5% ≥2,000 km.
- Extreme outliers (>4,000 km).
- Delayed orders had greater average distances.
- Installment orders had greater average distances.

---

**Delivery Stage Breakdown**

- Payment processing: 4% of total time
- Handover to carrier: 25.5%
- Carrier delivery: 70.5% (longest stage)
- Significant differences between stages (non-overlapping IQRs).

---

**Total Delivery Time**

- Median: 10+ days. 75%: 16+ days. 5%: 30+ days.
- More expensive orders took longer.
- Heavy orders took longer than light/medium.
- 1-star rating orders had noticeably longer delivery.
- Slowest among top states: Pará, Maranhão, Ceará.
- Slowest among top cities: Salvador, Porto Alegre, Rio de Janeiro.

---

**Delivery Delay**

- 75% of orders are delivered 6 or more days ahead of the estimated delivery time. Approximately 5% of orders are delayed by 4 or more days.
- The higher the order rating, the earlier it is delivered relative to the estimated delivery time.
- Among the top states with the highest number of sales, the top 3 states by delivery time ahead of the estimated time are Mato Grosso, Pará, and Rio Grande do Sul.

---

**Order Processing Time**

- 75% processed in ≥14 hours; 5% ≥48 hours.
- Slowest processing: Fridays/Saturdays. Fastest: Wednesdays.
- Nighttime processing was slower.
- Weekday processing was faster than weekends.
- Installment orders processed much faster.
- 1- and 2-star orders took longer to process.

---

**Carrier Handover Time**

- 75%: ≤3.5 days. 5%: ≥8 days.
- Slowest handovers: Friday/Saturday orders.
- Expensive/heavy orders took longer.
- Faster handover → higher ratings.

---

**Carrier Delivery Time**

- Median: 7+ days. 25%: 12+ days. 5%: 24+ days.
- Cheap/light items delivered faster.
- Longer carrier delivery → lower ratings.
- Slowest among top states: Pará, Maranhão, Ceará.
- Slowest among top cities: Salvador, Porto Alegre, Rio de Janeiro.

---

**Carrier Handover Delays**

- 75% of orders were handed over ≥1.6 days early.
- Extreme early handover outliers (data anomalies).
- 5% delayed by ≥0.79 days; 1% by ≥7 days.
- Earlier handover → higher ratings.

<h2 id="5-7"> 5.7 Payments</h2>

**Number of Payments**

- 75% of days had ≤230 payments; 5% ≥312. Several days exceeded 500.
- Payment method distribution:
    - Credit card: 74%
    - Boleto: 19%
    - Voucher: 5.5%
    - Debit card: 1.5%

---

**Payment Amounts**

- 75% ≤172 R$; 5% ≥440 R$. Outliers >6K R$.
- Highest average: credit card. Lowest: vouchers.

---

**Installment Breakdown**

- 51% of payments were single-installment.
- Some payments had 9+ installments.
- Credit card averaged 3.5 installments; other methods had none.

<h2 id="5-8"> 5.8 Sellers</h2>

- 75% of sellers sold ≤26 items total.
- 5% sold >150 items.
- 75% sold ≤10 unique items.
- 5% sold >45 unique items.
- 75% participated in ≤22 orders.
- 5% participated in ≥130 orders.
- 75% generated ≤3.5K R$ revenue. 5% ≥17K R$.
- 75% averaged 1.14 items per order; 1% averaged ≥3.
- 75% averaged ≤189 R$/order; 5% ≥641 R$.
- 75% averaged ≤174 R$/product; 5% ≥595 R$.
- 75% averaged ≤2.7 kg/product; 5% ≥11 kg.
- 5% delivered orders ≥6.5 days early.
- 75% delivered ≥2 days early; 5% delayed by ≥1 day.

<h2 id="5-9"> 5.9 Cohort Analysis</h2>

**Number of Sales**

- Most purchases occur in a cohort's first month.
- Stabilizes afterward without sharp declines.

---

**Revenue**

- Similar to purchase volume—most revenue generated in first month.
- No sharp lifetime declines.

---

**Average Order Value**

- No post-first-month decline (relative metric).
- No clear growth/decline trends across cohorts.
- Median AOV shows slight decline after 11 months.
- Anomalies:
    - March 2017 cohort (month 5)
    - April 2017 cohort (month 15)

---

**Number of Customers**  

- Mirrors revenue/purchase trends—few customers return after first month.

---

**Retention**

- Extremely low 1st+ month retention.
- Median retention confirms minimal repeat purchases.

---

**Avg Purchases per Customer**

- Mostly 1-1.3 purchases per period.

---

**ARPPU**

- Similar to AOV (most customers make only one purchase).

---

**LTV (Revenue-Based)**

- Minimal lifetime changes (most purchases occur in first month).

<h2 id="5-10"> 5.10 Correlation Analysis</h2>

**Sales**

- Moderate positive correlations:
    - Total delivery time vs carrier handover (0.4)
    - Total delivery time vs carrier delivery (0.6—stronger influence)
    - Total vs unique products per order (0.5)
    - Sellers per order vs unique products (0.6)
    - Sellers per order vs categories (0.6)
    - Shipping cost vs order value (0.5)
    - Shipping cost vs weight/volume (0.6)
    - Volume vs weight (0.8—strong)
- Key insights:
    - Carrier delivery impacts total time most
    - More unique products → more sellers
    - Heavier/larger orders cost more to ship
    - Distance increases delivery time/cost

---

**Customers**

- There is a high positive correlation (0.9) between the number of orders and the number of reviews. This means that customers who make more orders tend to leave more reviews.
- The more months a customer makes purchases, the more orders they place. This is logical.
- The higher the average number of unique products per order, the higher the average number of sellers. This indicates that customers with a greater variety of products in their orders tend to purchase from different sellers.

<h2 id="5-11"> 5.11 Black Friday</h2>

**Number of Sales**

- Sales growth began at 5 AM and continued until 10 AM. The first peak was reached at 10 AM. The second peak was at 1 PM. Sales then decreased until 6 PM.
- After 6 PM, sales growth resumed and reached the highest peak at 10 PM.
- A strange spike was observed at midnight on November 24. This might be due to the start of Black Friday, with some customers beginning to make purchases early.
- 64% of orders fall into the medium price category.
- 52% of orders fall into the medium weight category. 36% are lightweight.
- 49% of orders have a long delivery time category.
- 46% of orders have a medium delivery time category.
- Only 6% of orders have a fast delivery time category.
- This indicates that delivery times were significantly longer during Black Friday.
- 58% of orders have installment payments.
- 79% of orders were paid with a credit card. 18% were paid with boleto.
- The largest share of orders (13%) consists of products from the category Bed Bath Table.
- The largest share of orders (25%) consists of products from the generalized category Bed Bath Table, and 23% consist of products from the Furniture category.
- 49% of orders have a review rating of 5.
- The fewest orders have a review rating of 2 (4.3%).
- Orders with a review rating of 1 (17.5%) are more numerous than those with a review rating of 2 (4.3%) and 3 (10.7%).
- 37% of sales were in the state of São Paulo. 14% each were in the states of Rio de Janeiro and Minas Gerais. Sales in other states were 6% or less.
- São Paulo city accounted for 14% of all sales. Rio de Janeiro city accounted for 8% of all sales. Sales in other cities were 3% or less.

---

**Sum of Sales**

- The pattern of sales by sum is slightly different from the number of sales. The main peak was reached from 10 to 11 AM. Subsequently, the sum of sales decreased until 6 PM. Then there was an increase, but the previous maximum peak was not exceeded.

---

**Order Value**  

- 75% of orders on Black Friday had a value of up to 170 R$. However, there were many outliers with values exceeding 1000 R$.
- The average order value had an anomalous peak at 4 AM on November 25, 2017.
- It cannot be said that the average order value was higher on Black Friday compared to neighboring days.
- The highest order value was observed in the states of Espírito Santo, Mato Grosso, and Minas Gerais.

---

**Number of Customers**

- The dynamics of the number of customers mirrors the dynamics of the number of sales. This means customers were making one order each, which is logical.

---

**Share of Orders with Status 'unavailable'**

- The share of orders with the status 'unavailable' has spikes at 0 AM and 10 AM on November 23, 2017, and at 3 AM on November 24, 2017. There is also a very strong spike on November 26, 2017.
- It cannot be said that there was a shortage of goods specifically on Black Friday.

---

**Average Order Rating**

- 49% of orders created on November 23, 24, and 25, 2017, have a review rating of 5.

<h2 id="5-12"> 5.12 Cancelled Orders</h2>

**Number of Orders**

- The number of cancelled orders per month generally fluctuates between 20 and 40 orders.
- An unusually high number of cancelled orders was observed in February and August 2018.
- 55% of cancelled orders have an average price. 28% have a high price.
- 36% of cancelled orders have an average weight. 31% are light.
- 51% of cancelled orders do not have installment payments.
- 70% of cancelled orders were paid with a credit card. 16% were paid with boleto. 10% were paid with a voucher.
- The share of vouchers in cancelled orders is significantly higher than in all orders.
- The top 3 product categories in cancelled orders: Bed Bath Table, Health Beauty, Sports Leisure.
- The top 3 generalized product categories in cancelled orders: Electronics, Furniture, and Home & Garden.
- 42% of cancelled orders were in the state of São Paulo. 13% and 12% were in the states of Rio de Janeiro and Minas Gerais, respectively. Other states each accounted for 6% or less of cancelled orders.

---

**Sum of Sales**

- The sum of cancelled orders per month generally fluctuates between 4k and 8k R$.
- An unusual spike in the sum of cancelled orders was observed in July and August 2018.

---

**Average Order Value**

- There were sharp spikes in the average order value of cancelled orders in April 2017 and July 2018.

<h2 id="5-13"> 5.13 Hypothesis Testing Results</h2>

### 5.13.1 Time Patterns

**Are orders processed longer at night?**

**Result:**

- At the 0.05 significance level, the mean order processing time at night is statistically significantly different from the mean order processing time at any other time of day.
- Moreover, the processing time at night is statistically significantly longer than during other periods.

---

**Are orders processed longer on weekdays?**

**Result:**

- At the 0.05 significance level, the mean order processing time on weekdays is statistically significantly different from the mean order processing time on weekends.
- Moreover, the processing time on weekdays is statistically significantly shorter than on weekends.

### 5.13.2 Review Scores

**Do orders with a review score of 1 have a higher average order value?**

**Result:**

- At the 0.05 significance level, the mean order value for rating 1 is statistically significantly different from the mean order value for any other rating.
- Moreover, the mean order value for rating 1 is statistically significantly higher than for other ratings.

---

**Do orders with a review score of 1 take longer to deliver?**

**Result:**

- At the 0.05 significance level, the mean delivery time for rating 1 is statistically significantly different from the mean delivery time for any other rating.
- Moreover, the mean delivery time for rating 1 is statistically significantly longer than for other ratings.

### 5.13.3 Installment

**Are installment orders processed faster?**

**Result:**

- At the 0.05 significance level, the mean order processing time for installment payments is statistically significantly different from the mean order processing time for non-installment payments.
- Moreover, the mean order processing time for installment payments is statistically significantly shorter than for non-installment payments.

---

**Do installment orders have a higher average order value?**

**Result:**

- At the 0.05 significance level, the mean order value for installment payments is statistically significantly different from the mean order value for non-installment payments.
- Moreover, the mean order value for installment payments is statistically significantly higher than for non-installment payments.

---

**Do installment orders have a higher average order weight?**

**Result:**

- At the 0.05 significance level, the mean order weight for installment payments is statistically significantly different from the mean order weight for non-installment payments.
- Moreover, the mean order weight for installment payments is statistically significantly higher than for non-installment payments.

---

**Do installment orders have a higher average product price in the order?**

**Result:**

- At the 0.05 significance level, the mean product price in orders with installment payments is statistically significantly different from the mean product price in orders without installment payments.
- Moreover, the mean product price in orders with installment payments is statistically significantly higher than in orders without installment payments.

---

**Do installment orders have a higher average delivery cost?**

**Result:**

- At the 0.05 significance level, the mean delivery cost for orders with installment payments is statistically significantly different from the mean delivery cost for orders without installment payments.
- Moreover, the mean delivery cost for orders with installment payments is statistically significantly higher than for orders without installment payments.

### 5.13.4 Order Processing and Delivery

**Do delayed orders have a higher average order value?**

**Result:**

- At the 0.05 significance level, the mean value of delayed orders is statistically significantly different from the mean value of non-delayed orders.
- Moreover, the mean value of delayed orders is statistically significantly higher than that of non-delayed orders.

---

**Do delayed orders have a higher average order weight?**

**Result:**

- At the 0.05 significance level, the mean weight of delayed orders is statistically significantly different from the mean weight of non-delayed orders.
- Moreover, the mean weight of delayed orders is statistically significantly higher than that of non-delayed orders.

---

**Do delayed orders have a higher average product price in the order?**

**Result:**

- At the 0.05 significance level, the mean product price in delayed orders is statistically significantly different from the mean product price in non-delayed orders.
- Moreover, the mean product price in delayed orders is statistically significantly higher than in non-delayed orders.

---

**Do delayed orders have a higher average delivery cost?**

**Result:**

- At the 0.05 significance level, the mean delivery cost in delayed orders is statistically significantly different from the mean delivery cost in non-delayed orders.
- Moreover, the mean delivery cost in delayed orders is statistically significantly higher than in non-delayed orders.

---

**Does delivery delay affect the distribution of order ratings?**

**Result:**

- At the 0.05 significance level, the distribution of order ratings differs between delayed and non-delayed orders.
- Moreover, non-delayed orders have statistically significantly higher chances of receiving a higher rating.

---

**Are expensive orders delivered longer?**

**Result:**

- At the 0.05 significance level, the mean delivery time for the high-price order category is statistically significantly different from the mean delivery time for any other category.
- Moreover, the mean delivery time for the high-price category is statistically significantly longer than for other categories.

<h2 id="5-14"> 5.14 Detected Anomalies</h2>

**Duplicates**

- In the geolocation table, 26% of the rows are complete duplicates.
- The geolocation table contains 97% duplicate entries in the geolocation_zip_code_prefix column. This occurs because a single geolocation_zip_code_prefix can have multiple coordinates. This is not an anomaly but a characteristic of the data.
- There are 827 duplicate review_id entries in the reviews table. 
  - This means that one review was left for different orders. 
  - In March 2018, there was a significant spike in the number of such duplicates. 
  - The same review was left for different orders that have the same rating and review description but different products, prices, and other characteristics. 
  - It is possible that one review was left for several orders at once, or this could be an error in data collection.

---

**Missing Values**

- In February 2017 and August 2018, there was a spike in orders that do not have a payment approval time. The payment type voucher has a stronger influence on missing payment approval times compared to other types. This is likely a characteristic of this payment type.
- 14 delivered orders have missing payment approval times. All of these orders have the payment type boleto. This might be a characteristic of using boleto. All these orders were made in January and February 2017. There might have been a system failure, resulting in the payment approval times not being recorded.
- In November 2017, there was a spike in orders that do not have a delivery handover time to the carrier. This might be related to Black Friday.
- There are 2 delivered orders with missing delivery handover times to the carrier.
- A review rating of 1 is more strongly associated with missing delivery handover times and delivery times to the buyer. This might indicate that these orders were not delivered, and users are very dissatisfied. This is also evident from the review messages, which are mostly negative. The messages often mention that the order was not delivered.
- In the state of São Paulo, missing delivery handover times to the carrier are more pronounced.
- In November 2017, there was a spike in orders that do not have a delivery time to the buyer. This is likely related to Black Friday.
- There are 8 orders that have a status of delivered but are missing delivery times to the buyer. 7 of these were paid with a credit card, and one was paid with a debit card.
- There are 610 products with missing values in the product category name, product name length, product description length, and product photo count. All these missing values are in the same rows, meaning they are missing simultaneously across all these product characteristics.
- There are 2 products with missing values in the product length, height, width, and volume. All these missing values are in the same rows, meaning they are missing simultaneously across all these product characteristics.

---

**Order Status Anomalies**

- In March and April 2018, there was a sharp peak in orders stuck in the "shipped" status.
- In February and August 2018, there were peaks in the "canceled" status.
- In November 2017, there was a peak in the "unavailable" status. Black Friday occurred this month.
- 86% of orders stuck in the "processing" status have a rating of 1. 6% of orders have a rating of 2. Clearly, customers are dissatisfied. Based on review messages, orders were not delivered. Some reviews also mention that the product was out of stock.
- 74% of orders stuck in the "invoiced" status have a rating of 1. 9% of orders have a rating of 2. Clearly, customers are dissatisfied. Review messages indicate that orders were not delivered. Some reviews suggest the product was out of stock.
- 99 orders stuck in the "unavailable" status lack a category. This means they are not in the items table. This situation is more pronounced in the state of São Paulo compared to other states. Orders with the "boleto" payment type also stand out. 78% of orders with the "unavailable" status have a rating of 1. 8% have a rating of 2. Clearly, customers are dissatisfied. Review messages indicate that orders were not delivered. Some reviews suggest the product was out of stock.
- For orders stuck in the "canceled" status:
  - The process is interrupted at different stages. Most interruptions occur between payment approval and carrier handover.
  - From December 2017 to March 2018, there was a significant spike in interruptions after carrier handover. During this period, canceled orders had a carrier handover time but no delivery time, suggesting delivery issues.
  - In most months, interruptions occur after payment approval.
  - In August 2018, there was a sharp peak in orders canceled immediately after purchase, possibly due to payment issues.
  - The issue was more pronounced in the state of São Paulo and for the "boleto" payment type.
  - 69% of orders with the "canceled" status have a rating of 1. 7% have a rating of 2. Clearly, customers are dissatisfied. Review messages indicate that orders were not delivered. Some reviews suggest the product was out of stock.
- For orders stuck in the "shipped" status:
  - The issue is more pronounced in the state of Rio de Janeiro compared to other states.
  - 62% of orders with the "shipped" status have a rating of 1. 8% have a rating of 2. Clearly, customers are dissatisfied. Review messages indicate that most orders were not delivered.


---

**Status and Delivery Mismatches**

- Some orders are not marked as "delivered" but have a delivery date. Likely, these orders were canceled after delivery. Review messages indicate that orders were not delivered.
- There are 8 orders with the "delivered" status but no delivery time. Review messages suggest the products were delivered.
- There are 6 orders with the "canceled" status but a delivery time. Reviews indicate that some products were delivered, while others were not.

---

**Date Inconsistencies**

- There are 166 orders where the carrier handover time is earlier than the purchase time. These anomalies occurred only between April 25 and August 24, 2018. Possibly a system bug. Over 90% of these orders were paid with a credit card.
- There are 1359 orders where the carrier handover time is earlier than the payment approval time. Over 99% of these orders have the "delivered" status. Days with the highest number of anomalies: April 19–23, 2018, and July 3–4, 2018. Possible payment approval delays.
- There are 61 orders where the delivery time is earlier than the payment approval time. The issue is more pronounced for the "boleto" payment type. 87% of these anomalies occurred in the state of São Paulo.
- There are 23 orders where the delivery time is earlier than the carrier handover time.
- There are 65 orders where reviews were created before the orders themselves. Of these, 58 were canceled, 6 were delivered, and 1 has the "shipped" status.

---

**Other Anomalies**

- There are 9 payments with a zero amount. These have either the "voucher" or "not_defined" payment type. Possibly a payment feature.
- Over 1000 products were sold by multiple sellers located in different cities and states.
- 4 orders have an abnormally long carrier handover time, though their estimated delivery time is normal.
- 383 delivery fees are zero, possibly indicating free delivery. Most occurred between April and July 2018.
- 4 products have a zero weight. They belong to the "cama_mesa_banho" category (home textiles).
- Some orders were placed by customers with zip code prefixes outside South America.
- 7877 orders have a payment total that does not match the order total. Most of these have the "voucher" payment type, likely a feature of this payment method.
- The orders table contains one order not present in the payments table.
- The orders table contains 775 order IDs not present in the order items table: 603 "Unavailable," 164 "Canceled," 5 "Created," 2 "Invoiced," and 1 "Shipped."
- The customers table contains 157 zip_code_prefix values not present in the geolocation table.
- The geolocation table contains 4178 zip_code_prefix values not present in the customers table.
- The sellers table contains 7 zip_code_prefix values not present in the geolocation table.
- The geolocation table contains 16,776 zip_code_prefix values not present in the sellers table.

<h1 id="6"> 6 Recommendations</h1>

<h2 id="6-1"> 6.1 Product market fit</h2>

- Explore ways to increase retention. This is the most critical issue currently as retention is too low. Formulate and conduct A/B tests to improve retention.
- Analyze reviews from non-returning customers to gather feedback and identify the root causes of low retention. This is the most accurate way to understand why customers don't return.
- Improve positioning on external marketplaces like Amazon to ensure Olist appears in product cards. This could drive more external marketplace purchases.
- Develop promotions and discounts for repeat purchases to boost conversion rates for returning customers.
- Review the product assortment. The marketplace may lack desired products or suffer from quality issues.
- Study the customer experience on the website/app to identify potential pain points that may contribute to low retention and poor product/market fit.
- Find ways to increase the number of active buyers, as growth has stalled since 2018. Meanwhile, the number of sellers continues to grow, risking an imbalance.
- Reduce the time between first and second purchases, as currently ~50% of customers take more than 29 days to make a second purchase.

<h2 id="6-2"> 6.2 Customer loyalty Enhancement</h2>

**Implement a loyalty program for One-Time buyers:**

- Incentivize repeat purchases: discounts on second orders, loyalty points. 97% of buyers make only one purchase.
- Particularly target High-Spend customers—they have potential to become loyal.
- Offer discounts on previously purchased categories.

**Personalized communications for Potential Core customers:**

- This segment shows high metrics but hasn't transitioned to core loyalty.
- Provide exclusive offers in categories they've purchased before.
- Offer early access to sales.

**Premium service for Core customers:**

- Priority delivery, dedicated managers, special installment terms.

**Target the Critics segment:**

- Critics have high order values but low loyalty due to delivery issues.
- Offer personalized apologies and compensation for delayed orders.
- Create a "Second Chance" program with special offers.

**For Promoters:**

- Introduce a referral program.
- Offer exclusive discounts on frequently purchased categories.

<h2 id="6-3"> 6.3 Customer Experience Improvement</h2>

- Use CustDev to investigate causes of low review ratings (12% rated 1, 3% rated 2).
  - Focus on customers leaving 1- and 2-star ratings.
  - Categorize complaints: product quality, delivery, service, UI, etc.
  - Identify top pain points (e.g., 40% complain about slow delivery).
  - Focus on root causes, not just symptoms.
  - Prioritize fixes based on findings.
- Investigate why high-value orders receive more 1- and 2-star ratings—a clear problem in this segment.
- Improve UX for credit card and boleto payments (most used payment methods).
- Address issues for orders with multiple categories—they receive more 1- and 2-star ratings.
- Prioritize fixing 1-star ratings in Rio de Janeiro—a key market with disproportionately high poor ratings.

<h2 id="6-4"> 6.4 Logistics and Delivery Optimization</h2>

**Payment processing:**

- Reduce payment processing time (from order creation to approval).
  - 75% of orders take ≥14 hours; 5% take ≥48 hours.
  - Long processing harms UX.
- Optimize installment payment handling (heavier items, more delays).

**Payment approval to carrier handover:**

- Reduce this interval to speed up delivery and improve CX.
- Work with slow-to-ship sellers.

**Carrier delivery time:**

- Reduce carrier delivery time (71% of total delivery time).
- Collaborate with carriers to identify and fix delays.
- Focus on problematic regions.

**Delivery delays:**

- Investigate root causes (service issues, seller delays, logistics).
- Implement targeted fixes.

**Heavy order focus:**

- Heavy orders are more prone to delays and low ratings.
- Special handling: reinforced packaging, dedicated vehicles, detailed tracking.
- Leverage data: 75% of orders weigh <2kg—optimize logistics accordingly.
- Note: Rio de Janeiro has among the heaviest average order weights in top states.

**Premium product delivery:**

- Expedite delivery for high-value items—critical segment sensitive to delays.

**Special Attention to Lagging Regions:**

- Among the top states with the highest number of sales:
    - The top 3 states with the longest delivery times: Pará, Maranhão, Ceará.
- Among the top cities with the highest number of sales:
    - The top 3 cities with the longest delivery times: Salvador, Porto Alegre, Rio de Janeiro.


<h2 id="6-5"> 6.5 Financial optimization</h2>

**For High-Spend segment:**

- Premium installment plans for orders >500 R$.
- Free insurance for heavy orders (>10 kg).

**Payment incentives:**

- Promote credit cards (75% of payments, high average order value).
  - Cashback/bonuses for card payments.
  - Partner bank exclusives.

**Boost installment adoption:**

- Installment orders have higher values and weights.
- Simplify application process.
- Partner with banks to reduce interest rates.

**Voucher strategy:**

- Voucher-only users have low average order values.
- Time-limited offers to convert vouchers to purchases.
- Bundled "voucher + discounted item" deals.

**Canceled order reduction:**

- Reduce credit card cancellations (70% of canceled orders).
- Address voucher payment issues (overrepresented in cancellations).

<h2 id="6-6"> 6.6 Average Order Value and Basket Optimization</h2>

- Increase AOV (currently stagnant at 150-170 R$).
- Boost average products per order (currently 1.12-1.16).
- Address issues with high-value, multi-item orders (disproportionate 1-2 star ratings—critical segment).
- Raise AOV in São Paulo (key market with low AOV).
- Increase products per order (90% of orders contain one item).
  - Free shipping thresholds for multi-item orders.
- Weekend AOV growth: Since weekends show weak AOV lift vs. weekdays, develop strategies to capitalize on typical weekend spending spikes—a potential growth lever.

<h2 id="6-7"> 6.7 Segment-Specific Strategies</h2>

- **Champions RFM segment (4% of total):** Implement VIP service for this high-value group. Retention is critical.
- **High-Spend customers:** Improve delivery speed (currently slow) and service quality to reduce critic complaints.
- **Low-Spend customers:** Encourage higher AOV through cross-category discounts.
- **Fast repeat purchasers:** Enhance post-purchase experience to maintain their engagement.
- **Seasonal buyers ("Quarterly" segment):** Send targeted reminders before peak seasons.
- **Promoters:** Leverage their positive ratings to drive repeat purchases through tailored incentives.
- **Critics:** Focus on logistics and service improvements - their high spending makes them valuable if issues are resolved.

<h2 id="6-8"> 6.8 Product and Assortment Management</h2>

**Growing categories:**

- Expand Beauty/Health and Home/Garden categories showing consistent growth in 2018
- Increase inventory depth in these segments
- Launch targeted advertising campaigns

**Top-performing categories:**

- Electronics (27%), Furniture (18%), Home & Garden (14%) require:
  - Extended assortment
  - Value-added services (e.g., furniture assembly)
  - Enhanced warranties

**Underperforming categories:**

- Review pricing strategy for Watches & Gifts (most price volatility)
- Address issues in Security & Services (lowest sales volume)

**Multi-item order incentives:**

- Implement bundle strategies:
  - Free shipping thresholds (e.g., 3+ items)
  - Curated product sets with discounts

**Dead stock management:**

- 2.5% of products have never sold - analyze and liquidate

<h2 id="6-9"> 6.9 Marketing and Promotion</h2>

**Geo-targeted campaigns:**

- **São Paulo (42% share):** Furniture-focused promotions
- **Rio de Janeiro (13%):** Electronics with expedited delivery options
- **Minas Gerais (12%):** Regional-specific deals
- Tactics:
  - "City Days" with localized discounts
  - Micro-influencer partnerships

**Peak hour optimization:**

- Evening focus (36% of orders occur 18-22h)
- Implement "Happy Hour" flash sales
- Increase ad spend during peak windows

**Dormant customer reactivation:**

- Automated email sequences for:
  - Hibernating (22%)
  - Lost (4%) segments
- Personalized win-back offers

<h2 id="6-10"> 6.10 Peak Season Preparedness</h2>

**Black Friday learnings:**

- Scale warehouse capacity pre-peak
- Hire temporary fulfillment staff
- Stress-test systems before major events

<h2 id="6-11"> 6.11 Analytics and Monitoring</h2>

**Deep-dive analyses:**

- Investigate 2018 growth plateau:
  - Likely caused by declining retention + slowing new customer acquisition
- Underperforming states:
  - Benchmark against top 5 states for best practices
- Nov 2017-Mar 2018 delivery delays:
  - Root cause analysis may reveal systemic issues

**Real-time dashboards tracking:**

- Revenue
- MAU
- First-month repeat purchase rate
- Retention Rate
- ARPPU
- New customer acquisition share
- Delivery times (geo-specific)
- NPS

**Order cancellation analysis:**

- Feb/Aug 2018 spikes require investigation:
  - Technical failures
  - Carrier issues
  - Payment processing problems

<h2 id="6-12"> 6.12 A/B Testing</h2>

**Strategic focus areas:**
- Retention improvement
- Repeat purchase frequency
- ARPPU and AOV growth
- Delivery timeline compression

**Testing best practices:**
- Clear hypothesis prioritization
- Proper metric selection (including guardrails)
- Statistical significance thresholds

<h2 id="6-13"> 6.13 Anomaly</h2>

- Identify and resolve the reasons for order delays at different stages. This can affect the customer experience and may be a software bug.
- Determine the cause of date discrepancies. Correct situations where the date of a subsequent order stage is earlier than the date of the previous stage.
- Investigate why there is a mismatch between order status and date. For example, an order with a status of delivered should have a delivery date, but this is not always the case.
- Verify product IDs that were sold by different sellers to ensure there are no software bugs.
- Investigate why there are cases where a single review is left for multiple orders. Ensure that this is not an anomaly.
