In [3]:
import numpy as np # linear algebra
import pandas as pd
import sqlite3

# Loading Data

In [4]:
df_olist_customers = pd.read_csv('data/olist_customers_dataset.csv')
df_olist_sellers = pd.read_csv('data/olist_sellers_dataset.csv')
df_olist_order_reviews= pd.read_csv('data/olist_order_reviews_dataset.csv')
df_olist_order_items= pd.read_csv('data/olist_order_items_dataset.csv')
df_olist_products= pd.read_csv('data/olist_products_dataset.csv')
df_olist_geolocation= pd.read_csv('data/olist_geolocation_dataset.csv')
df_product_category_name_translation= pd.read_csv('data/product_category_name_translation.csv')
df_olist_orders = pd.read_csv('data/olist_orders_dataset.csv')
df_olist_order_payments= pd.read_csv('data/olist_order_payments_dataset.csv')

df_olist_order_payments.head()

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,b81ef226f3fe1789b1e8b2acac839d17,1,credit_card,8,99.33
1,a9810da82917af2d9aefd1278f1dcfa0,1,credit_card,1,24.39
2,25e8ea4e93396b6fa0d3dd708e76c1bd,1,credit_card,1,65.71
3,ba78997921bbcdc1373bb41e913ab953,1,credit_card,8,107.78
4,42fdf880ba16b47b59251dd489d4441a,1,credit_card,2,128.45


# Create SQLite Database and Export DataFrames.

In [7]:
# Create a SQLite database using SQLAlchemy
# Export each dataframe as a table to the SQLite database
from sqlalchemy import create_engine
from sqlalchemy import inspect

# Create a persistent SQLite DB file
engine = create_engine('sqlite:///olist.db', echo=False)

# Export all dataframes into the DB
df_olist_customers.to_sql("olist_customers", con=engine, index=False)
df_olist_sellers.to_sql("olist_sellers", con=engine, index=False)
df_olist_order_reviews.to_sql("olist_order_reviews", con=engine, index=False)
df_olist_order_items.to_sql("olist_order_items", con=engine, index=False)
df_olist_products.to_sql("olist_products", con=engine, index=False)
df_olist_geolocation.to_sql("olist_geolocation", con=engine, index=False)
df_product_category_name_translation.to_sql("product_category_name_translation", con=engine, index=False)
df_olist_orders.to_sql("olist_orders", con=engine, index=False)
df_olist_order_payments.to_sql("olist_order_payments", con=engine, index=False)

# Check
inspector = inspect(engine)
print(inspector.get_table_names())

['olist_customers', 'olist_geolocation', 'olist_order_items', 'olist_order_payments', 'olist_order_reviews', 'olist_orders', 'olist_products', 'olist_sellers', 'product_category_name_translation']


Query 1: Count and Percentage of Orders Purchased in Jan 2018 with 5 Review Score

In [6]:
# Write and execute a SQL query to count the number of orders purchased in January 2018 that have a review score of 5
# Calculate the percentage of such orders.
query = """
WITH orders AS (
    SELECT
        order_id,
        customer_id,
        order_status,
        strftime('%m', order_purchase_timestamp) AS purchase_month,
        strftime('%Y', order_purchase_timestamp) AS purchase_year
    FROM olist_orders
),
filtered AS (
    SELECT o.purchase_year, o.order_id
    FROM orders o
    JOIN olist_order_reviews r
      ON o.order_id = r.order_id
    WHERE r.review_score = 5
      AND o.purchase_year = '2018'
      AND o.purchase_month = '01'
)
SELECT
    purchase_year,
    COUNT(*) AS purchase_orders_count,
    COUNT(*) * 100 / (SELECT COUNT(*) FROM orders) AS percentage
FROM filtered
GROUP BY purchase_year;
"""
df = pd.read_sql(query, engine)
print(df)

  purchase_year  purchase_orders_count  percentage
0          2018                   4097           4


Query 2: Customer Purchase Trend Year-on-Year

In [7]:
# Write and execute a SQL query to analyze the customer purchase trend year-on-year.
query = '''
SELECT 
    strftime('%Y', o.order_purchase_timestamp) AS purchase_year,
    COUNT(*) AS total_orders,
    COUNT(DISTINCT o.customer_id) AS unique_customers,
    AVG(p.payment_value) AS avg_payment_per_order,
    SUM(p.payment_value) AS total_revenue
FROM olist_orders o
JOIN olist_order_payments p ON o.order_id = p.order_id
GROUP BY purchase_year
ORDER BY purchase_year;
'''

df = pd.read_sql(query, engine)
print(df)

  purchase_year  total_orders  unique_customers  avg_payment_per_order  \
0          2016           346               328             171.567457   
1          2017         47525             45101             152.545960   
2          2018         56015             54011             155.311310   

   total_revenue  
0       59362.34  
1     7249746.73  
2     8699763.05  


Query 3: Average Order Values of Customers

In [8]:
# Write and execute a SQL query to calculate the average order values of customers.
query = '''
SELECT
    c.customer_unique_id,
    AVG(p.payment_value) AS average_order_value
FROM olist_customers c
JOIN olist_orders o ON c.customer_id = o.customer_id
JOIN olist_order_payments p ON o.order_id = p.order_id
GROUP BY c.customer_unique_id
ORDER BY average_order_value DESC;
'''
df = pd.read_sql(query, engine)
print(df)

                     customer_unique_id  average_order_value
0      0a0a92112bd4c708ca5fde585afaa872         13664.080000
1      763c8b1c9c68a0229c42c9fc6f662b93          7274.880000
2      dc4802a71eae9be1dd28f5d788ceb526          6929.310000
3      459bef486812aa25204be022145caa62          6922.210000
4      ff4159b92c40ebe40454e3e6a7c35ed6          6726.660000
...                                 ...                  ...
96090  2bca5fe01d46ab5cc2bc5aaade88d850             3.166667
96091  569aa12b73b5f7edeaa6f2a01603e381             2.410769
96092  2524dcec233c3766f2c2b22f69fd65f4             1.856818
96093  968fac81e2c44fb6c1e3ac2a45e6a102             0.000000
96094  4fa4365000c7090fcb8cad5713c6d3db             0.000000

[96095 rows x 2 columns]


Query 4: Top 5 Cities with Highest Revenue from 2016 to 2018

In [9]:

query = '''
WITH orders AS(
    SELECT order_id, customer_id
    FROM olist_orders
    WHERE strftime('%Y', order_purchase_timestamp) IN ('2016', '2017', '2018')
)
SELECT
    c.customer_city AS city,
    SUM(p.payment_value) AS total_revenue
FROM olist_customers c
JOIN orders o ON c.customer_id = o.customer_id
JOIN olist_order_payments p ON o.order_id = p.order_id
GROUP BY c.customer_city
ORDER BY total_revenue DESC
LIMIT 5;
'''

df = pd.read_sql(query, engine)
print(df)

             city  total_revenue
0       sao paulo     2203373.09
1  rio de janeiro     1161927.36
2  belo horizonte      421765.12
3        brasilia      354216.78
4        curitiba      247392.48


Query 5: State Wise Revenue Table Between 2016 to 2018

In [10]:
# Write and execute a SQL query to create a state-wise revenue table between 2016 to 2018
query = '''
WITH orders AS(
    SELECT order_id, customer_id
    FROM olist_orders
    WHERE strftime('%Y', order_purchase_timestamp) IN ('2016', '2017', '2018')
)
SELECT
    c.customer_state AS state,
    SUM(p.payment_value) AS total_revenue
FROM olist_customers c
JOIN orders o ON c.customer_id = o.customer_id
JOIN olist_order_payments p ON o.order_id = p.order_id
GROUP BY c.customer_state
ORDER BY total_revenue DESC
'''

df = pd.read_sql(query, engine)
print(df)


   state  total_revenue
0     SP     5998226.96
1     RJ     2144379.69
2     MG     1872257.26
3     RS      890898.54
4     PR      811156.38
5     SC      623086.43
6     BA      616645.82
7     DF      355141.08
8     GO      350092.31
9     ES      325967.55
10    PE      324850.44
11    CE      279464.03
12    PA      218295.85
13    MT      187029.29
14    MA      152523.02
15    PB      141545.72
16    MS      137534.84
17    PI      108523.97
18    RN      102718.13
19    AL       96962.06
20    SE       75246.25
21    TO       61485.33
22    RO       60866.20
23    AM       27966.93
24    AC       19680.62
25    AP       16262.80
26    RR       10064.62


Query 6: Top Successful Sellers in Terms of Goods Sold, Revenue, and Customer Count

In [11]:
# Write and execute a SQL query to identify the top successful sellers in terms of the 
# Number of goods sold, Total revenue, Customer count, and sellers with the highest count of 5-star ratings.
query = '''
WITH seller_agg AS (
    SELECT
        s.seller_id AS seller,
        COUNT(DISTINCT o.order_id) AS total_goods_sold,
        SUM(i.price) AS total_revenue,
        COUNT(DISTINCT o.customer_id) AS customer_count,
        COUNT(DISTINCT CASE WHEN r.review_score = 5 THEN o.order_id END) AS five_star_reviews
    FROM olist_orders o
    JOIN olist_order_items i ON o.order_id = i.order_id
    JOIN olist_sellers s ON i.seller_id = s.seller_id
    LEFT JOIN olist_order_reviews r ON o.order_id = r.order_id
    GROUP BY s.seller_id
),
ranked AS (
    SELECT
        *,
        RANK() OVER (ORDER BY total_goods_sold DESC) AS rank_total_goods,
        RANK() OVER (ORDER BY total_revenue DESC) AS rank_total_revenue,
        RANK() OVER (ORDER BY customer_count DESC) AS rank_total_customers,
        RANK() OVER (ORDER BY five_star_reviews DESC) AS rank_total_reviews
    FROM seller_agg
)
SELECT
    seller,
    total_goods_sold,
    total_revenue,
    customer_count,
    five_star_reviews
FROM ranked
WHERE 
       rank_total_goods = 1
    OR rank_total_revenue = 1
    OR rank_total_customers = 1
    OR rank_total_reviews = 1
ORDER BY rank_total_customers;
'''

df = pd.read_sql(query, engine)
print(df)

                             seller  total_goods_sold  total_revenue  \
0  6560211a19b47992c3666cc44a7e94c0              1854      123585.82   
1  cc419e0650a3c5ba77189a1882b7556a              1706      106555.98   
2  4869f7a5dfa277a7dca6462dcf3b52b2              1132      229472.63   

   customer_count  five_star_reviews  
0            1854                945  
1            1706                996  
2            1132                671  


Query 7: Delivery Success Rate Across States

In [12]:
# Write and execute a SQL query to calculate the delivery success rate across different states.
query = '''
WITH orders AS (
    SELECT
        s.seller_state,
        COUNT(DISTINCT o.order_id) AS total_orders,
        COUNT(DISTINCT CASE WHEN o.order_status = 'delivered' THEN o.order_id END) AS delivered_orders
    FROM olist_orders o
    JOIN olist_order_items i ON o.order_id = i.order_id
    JOIN olist_sellers s ON i.seller_id = s.seller_id
    GROUP BY s.seller_state
)
SELECT
    seller_state,
    total_orders,
    delivered_orders,
    ROUND(1.0 * delivered_orders / total_orders, 4)*100 AS success_rate
FROM orders
ORDER BY total_orders DESC
'''

df = pd.read_sql(query, engine)
print(df)

   seller_state  total_orders  delivered_orders  success_rate
0            SP         70188             68641         97.80
1            MG          7930              7735         97.54
2            PR          7673              7512         97.90
3            RJ          4353              4227         97.11
4            SC          3667              3603         98.25
5            RS          1989              1962         98.64
6            DF           824               808         98.06
7            BA           569               550         96.66
8            GO           463               451         97.41
9            PE           406               403         99.26
10           MA           392               389         99.23
11           ES           318               310         97.48
12           MT           137               136         99.27
13           CE            91                87         95.60
14           RN            51                51        100.00
15      

Query 8: Preferred Form of Payment for Different Categories

In [13]:
query = '''
WITH ranked AS (
    SELECT
        p.product_category_name AS category,
        op.payment_type,
        RANK() OVER (PARTITION BY p.product_category_name ORDER BY COUNT(*) DESC) AS rank,
        COUNT(DISTINCT o.order_id) AS count
    FROM olist_orders o
    JOIN olist_order_payments op ON o.order_id = op.order_id
    JOIN olist_order_items i ON o.order_id = i.order_id
    JOIN olist_products p ON i.product_id = p.product_id
    WHERE p.product_category_name IS NOT NULL
    GROUP BY p.product_category_name, op.payment_type
)
SELECT
    category,
    payment_type,
    count
FROM ranked
WHERE rank = 1
'''

df = pd.read_sql(query, engine)
print(df)

                     category payment_type  count
0   agro_industria_e_comercio  credit_card    125
1                   alimentos  credit_card    328
2           alimentos_bebidas  credit_card    164
3                       artes  credit_card    147
4          artes_e_artesanato  credit_card     13
..                        ...          ...    ...
69    sinalizacao_e_seguranca  credit_card     94
70   tablets_impressao_imagem  credit_card     62
71                  telefonia  credit_card   3108
72             telefonia_fixa  credit_card    162
73      utilidades_domesticas  credit_card   4613

[74 rows x 3 columns]


Query 9: Distance Between Cities

In [None]:
query = '''
WITH city_coords AS (
    SELECT
        geolocation_city AS city,
        geolocation_state AS state,
        AVG(geolocation_lat) AS lat,
        AVG(geolocation_lng) AS lng
    FROM olist_geolocation
    GROUP BY geolocation_city, geolocation_state
)
SELECT
    a.city AS city_a,
    a.state AS state_a,
    b.city AS city_b,
    b.state AS state_b,
    ROUND(
        6371 * acos(
            cos(radians(a.lat)) * cos(radians(b.lat)) *
            cos(radians(b.lng - a.lng)) +
            sin(radians(a.lat)) * sin(radians(b.lat))
        ),
    2) AS distance_km
FROM city_coords a
JOIN city_coords b
    ON a.city < b.city
ORDER BY distance_km ASC;
'''

df = pd.read_sql(query, engine)
print(df)