# Improving Delivery Rate Performance on Olist E-Commerce
Alpha Team - DTIDSOL-02



### **Contents**

1. Business Problem Understanding
2. Data Cleaning and Understanding
3. EDA
4. Data analysis
5. Summary
6. Conclusion 
7. Limitations
8. Recommendations

****

## Background and aim

Background and aim of analysis
Business problem identification 

## Import Libraries

In [5]:
# Import Library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import seaborn as sns
import plotly.express as px
from statsmodels.stats.proportion import proportions_ztest
from scipy.stats import normaltest, chi2_contingency, mannwhitneyu, ttest_ind, kstest
import statsmodels.api as sm
import os

import warnings
# Hide FutureWarning messages so they do not appear in the cell outputs.
warnings.filterwarnings("ignore", category=FutureWarning)
# Lift the column-width limit so long cell values are shown in full when a
# DataFrame is displayed.
pd.set_option('display.max_colwidth', None)

# Timestamp columns carried by both the raw and the cleaned orders files.
_order_timestamp_columns = (
    'order_purchase_timestamp',
    'order_approved_at',
    'order_delivered_carrier_date',
    'order_delivered_customer_date',
    'order_estimated_delivery_date',
)

# Maps a CSV file name to the columns that should be parsed as datetimes when
# that file is loaded. Files that are not listed have no date columns:
#   olist_customers_dataset.csv
#   olist_geolocation_dataset.csv
#   olist_order_payments_dataset.csv
#   olist_products_dataset.csv
#   olist_sellers_dataset.csv
#   product_category_name_translation.csv
date_cols = {
    'olist_orders_dataset.csv': list(_order_timestamp_columns),
    'olist_order_items_dataset.csv': ['shipping_limit_date'],
    'olist_order_reviews_dataset.csv': [
        'review_creation_date',
        'review_answer_timestamp',
    ],
    # Each entry gets its own list so editing one never affects the other.
    'olist_orders_delivered_clean.csv': list(_order_timestamp_columns),
}

def read_olist_csv(path, **read_csv_kwargs):
    """
    Reads an Olist CSV and parses dates for the correct columns.

    The base name of ``path`` is looked up in the module-level ``date_cols``
    mapping to decide which columns are parsed as datetimes. Files without an
    entry are read with no date parsing.

    Args:
        path (str): Path to the CSV file.
        **read_csv_kwargs: Optional keyword arguments forwarded unchanged to
            ``pd.read_csv`` (for example ``delimiter=';'`` for a
            semicolon-separated file). If ``parse_dates`` is given here it
            takes precedence over the ``date_cols`` lookup.
    Returns:
        pd.DataFrame: Loaded dataframe with date columns parsed as datetime.
    """
    # Only the file name matters for the lookup, not the directory it sits in.
    filename = os.path.basename(path)
    # Use the configured date columns unless the caller chose their own.
    # A copy is passed so the shared mapping can never be modified through it.
    read_csv_kwargs.setdefault('parse_dates', list(date_cols.get(filename, [])))
    return pd.read_csv(path, **read_csv_kwargs)

In [6]:
# Load necessary datasets:
# Every cleaned CSV lives in the same folder, so the location is kept in one
# place; pointing the notebook at another copy of the data is a one-line change.
DATA_DIR = '/Users/josephinegunawan/Documents/GitHub/Final-Project-Olist/data/cleaned_data'

# The geolocation file is read with a ';' separator, unlike the other files.
olist_geolocation_dataset = pd.read_csv(os.path.join(DATA_DIR, 'olist_geolocation_dataset.csv'), delimiter=';')
olist_orders_dataset = read_olist_csv(os.path.join(DATA_DIR, 'olist_orders_dataset.csv'))
olist_customers_dataset = pd.read_csv(os.path.join(DATA_DIR, 'olist_customers_dataset.csv'))
olist_order_items_dataset = read_olist_csv(os.path.join(DATA_DIR, 'olist_order_items_dataset.csv'))
olist_order_payments_dataset = read_olist_csv(os.path.join(DATA_DIR, 'olist_order_payments_dataset.csv'))
olist_order_reviews_dataset = read_olist_csv(os.path.join(DATA_DIR, 'olist_order_reviews_dataset.csv'))
olist_products_dataset = pd.read_csv(os.path.join(DATA_DIR, 'olist_products_dataset.csv'))
olist_sellers_dataset = pd.read_csv(os.path.join(DATA_DIR, 'olist_sellers_dataset.csv'))
product_category_name_translation = pd.read_csv(os.path.join(DATA_DIR, 'product_category_name_translation.csv'))


In [13]:
def merge_datasets():
    """
    Builds one wide DataFrame by joining the loaded Olist tables onto orders.

    Starting from the orders table, each remaining table is attached with a
    left join on its key column, in the order listed in ``join_plan``. The
    module-level dataset variables are read directly; nothing is modified.

    NOTE(review): items, payments and reviews are each joined on ``order_id``,
    so an order with several matching rows in those tables presumably appears
    on several rows of the result. Confirm before counting orders.

    Returns:
        pd.DataFrame: Merged DataFrame containing all relevant information.
    """
    # (table to attach, column to join on), applied top to bottom.
    join_plan = (
        (olist_order_items_dataset, 'order_id'),
        (olist_order_payments_dataset, 'order_id'),
        (olist_order_reviews_dataset, 'order_id'),
        (olist_customers_dataset, 'customer_id'),
        (olist_sellers_dataset, 'seller_id'),
        (olist_products_dataset, 'product_id'),
        (product_category_name_translation, 'product_category_name'),
    )

    combined = olist_orders_dataset
    for right_table, join_key in join_plan:
        combined = combined.merge(right_table, on=join_key, how='left')
    return combined


# Merge once and reuse the result for both views: every merge_datasets() call
# repeats the whole chain of joins, so calling it twice doubles the work.
merged_preview = merge_datasets()
display(merged_preview.head(), merged_preview.tail())

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,order_item_id,product_id,...,seller_state,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm,product_category_name_english
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18,1,87285b34884572647811a353c7ac498a,...,SP,utilidades_domesticas,40.0,268.0,4.0,500.0,19.0,8.0,13.0,housewares
1,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18,1,87285b34884572647811a353c7ac498a,...,SP,utilidades_domesticas,40.0,268.0,4.0,500.0,19.0,8.0,13.0,housewares
2,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18,1,87285b34884572647811a353c7ac498a,...,SP,utilidades_domesticas,40.0,268.0,4.0,500.0,19.0,8.0,13.0,housewares
3,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13,1,595fac2a385ac33a80bd5114aec74eb8,...,SP,perfumaria,29.0,178.0,1.0,400.0,19.0,13.0,19.0,perfumery
4,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04,1,aa4383b373c6aca5d8797843e5594415,...,SP,automotivo,46.0,232.0,1.0,420.0,24.0,19.0,21.0,auto


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,order_item_id,product_id,...,seller_state,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm,product_category_name_english
115110,63943bddc261676b46f01ca7ac2f7bd8,1fca14ff2861355f6e5f14306ff977a7,delivered,2018-02-06 12:58:58,2018-02-06 13:10:37,2018-02-07 23:22:42,2018-02-28 17:37:56,2018-03-02,1,f1d4ce8c6dd66c47bbaa8c6781c2a923,...,SP,bebes,52.0,828.0,4.0,4950.0,40.0,10.0,40.0,baby
115111,83c1379a015df1e13d02aae0204711ab,1aa71eb042121263aafbe80c1b562c9c,delivered,2017-08-27 14:46:43,2017-08-27 15:04:16,2017-08-28 20:52:26,2017-09-21 11:24:17,2017-09-27,1,b80910977a37536adeddd63663f916ad,...,SP,eletrodomesticos_2,51.0,500.0,2.0,13300.0,32.0,90.0,22.0,home_appliances_2
115112,11c177c8e97725db2631073c19f07b62,b331b74b18dc79bcdf6532d51e1637c1,delivered,2018-01-08 21:28:27,2018-01-08 21:36:21,2018-01-12 15:35:03,2018-01-25 23:32:54,2018-02-15,1,d1c427060a0f73f6b889a5c7c61f2ac4,...,MG,informatica_acessorios,59.0,1893.0,1.0,6550.0,20.0,20.0,20.0,computers_accessories
115113,11c177c8e97725db2631073c19f07b62,b331b74b18dc79bcdf6532d51e1637c1,delivered,2018-01-08 21:28:27,2018-01-08 21:36:21,2018-01-12 15:35:03,2018-01-25 23:32:54,2018-02-15,2,d1c427060a0f73f6b889a5c7c61f2ac4,...,MG,informatica_acessorios,59.0,1893.0,1.0,6550.0,20.0,20.0,20.0,computers_accessories
115114,66dea50a8b16d9b4dee7af250b4be1a5,edb027a75a1449115f6b43211ae02a24,delivered,2018-03-08 20:57:30,2018-03-09 11:20:28,2018-03-09 22:11:59,2018-03-16 13:08:30,2018-04-03,1,006619bbed68b000c8ba3f8725d5409e,...,SP,beleza_saude,45.0,569.0,1.0,150.0,16.0,7.0,15.0,health_beauty


In [14]:
# Keep the merged table in a variable so the following cells can reuse it.
merged_df = merge_datasets()

In [15]:
# Print each column's name, non-null count and dtype for the merged table.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115115 entries, 0 to 115114
Data columns (total 40 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       115115 non-null  object        
 1   customer_id                    115115 non-null  object        
 2   order_status                   115115 non-null  object        
 3   order_purchase_timestamp       115115 non-null  datetime64[ns]
 4   order_approved_at              115100 non-null  datetime64[ns]
 5   order_delivered_carrier_date   115114 non-null  datetime64[ns]
 6   order_delivered_customer_date  115115 non-null  datetime64[ns]
 7   order_estimated_delivery_date  115115 non-null  datetime64[ns]
 8   order_item_id                  115115 non-null  int64         
 9   product_id                     115115 non-null  object        
 10  seller_id                      115115 non-null  object        
 11  

In [17]:
# Quick structural overview of the merged table.
print(merged_df.shape)    # (number of rows, number of columns)
print(merged_df.columns)  # column labels
# info() prints its own report and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line after the report.
merged_df.info()


(115115, 40)
Index(['order_id', 'customer_id', 'order_status', 'order_purchase_timestamp',
       'order_approved_at', 'order_delivered_carrier_date',
       'order_delivered_customer_date', 'order_estimated_delivery_date',
       'order_item_id', 'product_id', 'seller_id', 'shipping_limit_date',
       'price', 'freight_value', 'payment_sequential', 'payment_type',
       'payment_installments', 'payment_value', 'review_id', 'review_score',
       'review_comment_title', 'review_comment_message',
       'review_creation_date', 'review_answer_timestamp', 'customer_unique_id',
       'customer_zip_code_prefix', 'customer_city', 'customer_state',
       'seller_zip_code_prefix', 'seller_city', 'seller_state',
       'product_category_name', 'product_name_lenght',
       'product_description_lenght', 'product_photos_qty', 'product_weight_g',
       'product_length_cm', 'product_height_cm', 'product_width_cm',
       'product_category_name_english'],
      dtype='object')
<class 'pandas.co