In [1]:
import pandas as pd
import numpy as np

In [2]:
# Import all the datasets. Every CSV is read from the current working
# directory, in the same order as the names on the left-hand side.
orders, order_items, payments, reviews = (
    pd.read_csv(csv_name)
    for csv_name in (
        "olist_orders_dataset.csv",
        "olist_order_items_dataset.csv",
        "olist_order_payments_dataset.csv",
        "olist_order_reviews_dataset.csv",
    )
)
customers, products, sellers, category_map = (
    pd.read_csv(csv_name)
    for csv_name in (
        "olist_customers_dataset.csv",
        "olist_products_dataset.csv",
        "olist_sellers_dataset.csv",
        "product_category_name_translation.csv",
    )
)

In [3]:
# (rows, columns) of the orders table.
orders.shape

(99441, 8)

In [4]:
# Column dtypes and non-null counts for the orders table.
orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   order_id                       99441 non-null  object
 1   customer_id                    99441 non-null  object
 2   order_status                   99441 non-null  object
 3   order_purchase_timestamp       99441 non-null  object
 4   order_approved_at              99281 non-null  object
 5   order_delivered_carrier_date   97658 non-null  object
 6   order_delivered_customer_date  96476 non-null  object
 7   order_estimated_delivery_date  99441 non-null  object
dtypes: object(8)
memory usage: 6.1+ MB


In [5]:
# Number of missing values in each column of orders.
orders.isna().sum()
# Nulls appear in only three columns: 'order_approved_at',
# 'order_delivered_carrier_date' and 'order_delivered_customer_date'.

order_id                            0
customer_id                         0
order_status                        0
order_purchase_timestamp            0
order_approved_at                 160
order_delivered_carrier_date     1783
order_delivered_customer_date    2965
order_estimated_delivery_date       0
dtype: int64

#Understanding the Data

In [6]:
# Percentage of orders in each status.
# 'canceled' and 'unavailable' together account for roughly 1.2% of all orders;
# these orders were never fulfilled, so they are a likely source of lost
# revenue (the loss itself is not quantified here).
orders['order_status'].value_counts(normalize=True).mul(100)

order_status
delivered      97.020344
shipped         1.113223
canceled        0.628513
unavailable     0.612423
invoiced        0.315765
processing      0.302692
created         0.005028
approved        0.002011
Name: proportion, dtype: float64

In [7]:
# The timestamp columns are read from CSV as strings (object dtype); convert
# them to datetime so date arithmetic works on them.
# 'order_delivered_carrier_date' is included so that every timestamp column
# in orders ends up with the same dtype.
date_cols = [
    'order_purchase_timestamp',
    'order_approved_at',
    'order_delivered_carrier_date',
    'order_delivered_customer_date',
    'order_estimated_delivery_date'
]

for col in date_cols:
    # errors='coerce' turns values that cannot be parsed into NaT instead of
    # raising. Re-running this cell is safe: columns that are already
    # datetime are left unchanged.
    orders[col] = pd.to_datetime(orders[col], errors='coerce')

In [8]:
# Delay in whole days = actual delivery date minus estimated delivery date.
# Positive -> delivered after the estimate (late); negative -> delivered early.
# Orders with no delivery date get NaN.
orders['delivery_delay_days'] = (
    orders['order_delivered_customer_date']
    .sub(orders['order_estimated_delivery_date'])
    .dt.days
)

In [9]:
orders['delivery_delay_days'].describe()
# delivery_delay_days is the delivered date minus the estimated date, so
# negative values mean the order arrived early. On average orders arrive
# about 12 days BEFORE the estimated date (mean of about -11.9); the latest
# order was 188 days past its estimate and the earliest 147 days ahead of it.

count    96476.000000
mean       -11.876881
std         10.183854
min       -147.000000
25%        -17.000000
50%        -12.000000
75%         -7.000000
max        188.000000
Name: delivery_delay_days, dtype: float64

In [10]:
# Hypothesis: as the delivery delay increases, review scores go down.

In [11]:
# Attach the review score to each order. The left join keeps orders that have
# no review; their review_score is NaN and groupby leaves them out.
# NOTE(review): if reviews holds more than one row for an order_id, that order
# is repeated in the merged frame and weighs more in the means below - verify.
orders_reviews = pd.merge(
    orders,
    reviews[['order_id', 'review_score']],
    how='left',
    on='order_id',
)

# Mean delivery delay (in days, negative = early) for each review score.
orders_reviews.groupby('review_score')['delivery_delay_days'].mean()

review_score
1.0    -4.060580
2.0    -8.634818
3.0   -10.774052
4.0   -12.380840
5.0   -13.388153
Name: delivery_delay_days, dtype: float64

In [12]:
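# Every mean is negative because most orders arrive before the estimated date,
# but the trend already matches the hypothesis: 1-star orders arrive closest to
# the estimate (about -4 days) and 5-star orders arrive earliest (about -13 days).
# To see the effect more directly, let's look only at the orders that were actually late.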

In [13]:
# Keep only the orders delivered after their estimated date (delay > 0);
# rows with a missing delay are dropped by the comparison.
late_orders = orders_reviews.loc[orders_reviews['delivery_delay_days'].gt(0)]

# Mean number of days late for each review score.
(
    late_orders
    .groupby('review_score')['delivery_delay_days']
    .mean()
)


review_score
1.0    12.360337
2.0    10.233813
3.0     9.058739
4.0     8.414110
5.0     6.989623
Name: delivery_delay_days, dtype: float64

In [14]:
# Confirmed: among late orders, the longer the delay, the lower the review score.

In [16]:
# Number of orders per customer_id, then how many customer_ids have each count.
# Every customer_id turns out to have exactly one order, so customer_id cannot
# be used to find repeat customers.
customer_order_counts = (
    orders
    .groupby('customer_id')['order_id']
    .count()
)
customer_order_counts.value_counts().head(10)

order_id
1    99441
Name: count, dtype: int64

In [20]:
# Peek at the customers table; it carries both customer_id and customer_unique_id.
customers.head()

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP


In [None]:
# Attach customer_unique_id to every order. customer_id is unique per order,
# so customer_unique_id is the key used to count orders per customer.
# validate='many_to_one' makes pandas raise MergeError if a customer_id occurs
# more than once in customers; without it such duplicates would silently
# repeat orders and inflate the per-customer order counts.
orders_customers = orders.merge(
    customers[['customer_id', 'customer_unique_id']],
    on='customer_id',
    how='left',
    validate='many_to_one',
)


In [25]:
# Number of orders per customer_unique_id, then how many customers placed
# each number of orders (1 order, 2 orders, ...).
customer_order_counts = (
    orders_customers
    .groupby('customer_unique_id')['order_id']
    .count()
)
customer_order_counts.value_counts().head(10)


order_id
1     93099
2      2745
3       203
4        30
5         8
6         6
7         3
9         1
17        1
Name: count, dtype: int64

In [24]:
# Count the distinct sellers and distinct products that appear in order_items.
order_items[['seller_id','product_id']].nunique()
# There are 3095 unique sellers and 32951 unique products.

seller_id      3095
product_id    32951
dtype: int64