In [1]:
import sqlite3
from pathlib import Path

import pandas as pd

# sqlite3.connect() silently creates a new, empty database file when the path
# does not exist, which would make the success message below misleading and
# push the real error ("no such table") into a later cell. Fail fast instead.
DB_PATH = Path('../data/olist.db')
if not DB_PATH.is_file():
    raise FileNotFoundError(f"SQLite database not found: {DB_PATH.resolve()}")

# Shared connection used by the query cells that follow.
conn = sqlite3.connect(DB_PATH)
print("✓ Connected to SQLite database")


✓ Connected to SQLite database


In [2]:
# Cohort activity: each customer is assigned to the month of their first
# purchase (cohort_month); we then count how many distinct customers from each
# cohort placed an order in each calendar month (order_month).
#
# No LIMIT in the SQL: cohort_df must hold the complete cohort table so it can
# be pivoted/analysed later. The preview is restricted with .head() instead.
query = """
WITH first_purchase AS (
    SELECT
        customer_unique_id,
        MIN(strftime('%Y-%m', order_purchase_timestamp)) AS cohort_month
    FROM master_orders
    GROUP BY customer_unique_id
),

customer_orders AS (
    SELECT
        m.customer_unique_id,
        strftime('%Y-%m', m.order_purchase_timestamp) AS order_month,
        f.cohort_month
    FROM master_orders m
    JOIN first_purchase f
        ON m.customer_unique_id = f.customer_unique_id
)

SELECT
    cohort_month,
    order_month,
    COUNT(DISTINCT customer_unique_id) AS active_customers
FROM customer_orders
GROUP BY cohort_month, order_month
ORDER BY cohort_month, order_month;
"""

cohort_df = pd.read_sql_query(query, conn)
cohort_df.head(10)


Unnamed: 0,cohort_month,order_month,active_customers
0,2016-10,2016-10,262
1,2016-10,2017-04,1
2,2016-10,2017-07,1
3,2016-10,2017-09,1
4,2016-10,2017-11,1
5,2016-10,2018-01,1
6,2016-10,2018-03,1
7,2016-10,2018-05,2
8,2016-10,2018-06,2
9,2016-12,2016-12,1


In [3]:
# Revenue concentration: sum total_payment per customer, rank customers by
# that revenue (highest first) and split them into five equal-sized buckets
# with NTILE(5), so bucket 1 holds the top spenders. The final SELECT reports
# the customer count, total revenue and average revenue of each bucket.
query = """
WITH customer_revenue AS (
    SELECT
        customer_unique_id,
        SUM(total_payment) AS revenue
    FROM master_orders
    GROUP BY customer_unique_id
),

ranked AS (
    SELECT
        customer_unique_id,
        revenue,
        NTILE(5) OVER (ORDER BY revenue DESC) AS revenue_bucket
    FROM customer_revenue
)

SELECT
    revenue_bucket,
    COUNT(*)                        AS customers,
    ROUND(SUM(revenue), 2)          AS total_revenue,
    ROUND(AVG(revenue), 2)          AS avg_revenue
FROM ranked
GROUP BY revenue_bucket
ORDER BY revenue_bucket;
"""

# Run against the shared connection; the bare name on the last line renders
# the five-row summary table.
pareto_df = pd.read_sql_query(sql=query, con=conn)
pareto_df


Unnamed: 0,revenue_bucket,customers,total_revenue,avg_revenue
0,1,18672,8254874.14,442.1
1,2,18672,3083941.53,165.16
2,3,18671,2025857.66,108.5
3,4,18671,1316651.05,70.52
4,5,18671,741137.39,39.69


In [4]:
# Split orders by the is_late flag and compare, per group, the order count,
# the mean review score and the mean payment value (both rounded to 2 dp).
query = """
SELECT
    is_late,
    COUNT(*)                     AS total_orders,
    ROUND(AVG(review_score), 2)  AS avg_review_score,
    ROUND(AVG(total_payment), 2) AS avg_order_value
FROM master_orders
GROUP BY is_late
ORDER BY is_late;
"""

# One row per distinct is_late value, displayed as the cell result.
late_df = pd.read_sql_query(sql=query, con=conn)
late_df


Unnamed: 0,is_late,total_orders,avg_review_score,avg_order_value
0,0,88652,4.29,158.72
1,1,7825,2.57,172.73


In [7]:
import sqlite3
from contextlib import closing

import pandas as pd

# Read the RFM scoring query from its .sql file (one level up, in the sql
# folder). This happens before the connection is opened so a missing file
# cannot leave a connection dangling. The encoding is explicit so the result
# does not depend on the platform default.
with open('../sql/rfm_scoring.sql', 'r', encoding='utf-8') as file:
    query = file.read()

# Run the query against the database (one level up, in the data folder).
# A sqlite3 connection used directly as a context manager only commits or
# rolls back; it does not close. closing() guarantees conn.close() runs even
# if the query raises.
with closing(sqlite3.connect('../data/olist.db')) as conn:
    df = pd.read_sql_query(query, conn)

# Show first few rows
df.head()


Unnamed: 0,customer_unique_id,recency_days,frequency,monetary,r_score,f_score,m_score,rfm_total,customer_segment
0,ba09cdff04586794fc557641afb2bd1a,162.0,1,189.06,4,4,4,12,Champions
1,68024612aa18d6e8c5278f31060dd079,162.0,1,189.19,4,4,4,12,Champions
2,f442c71d123cbc47ef86cc8fdd240ec8,162.0,1,190.47,4,4,4,12,Champions
3,e8f730c94f50e6b91bb5c5b2f923ea3e,162.0,1,192.61,4,4,4,12,Champions
4,9d30f7b1f1ec3287b26252ffabc07ec1,162.0,1,193.51,4,4,4,12,Champions


In [8]:
# Row count per customer_segment value; value_counts() orders the result
# from most to least frequent.
segment_counts = df['customer_segment'].value_counts()
segment_counts


customer_segment
Loyal Customers        34383
Potential Loyalists    23663
Champions              23588
At Risk                11723
Name: count, dtype: int64

In [9]:
# Sum of the monetary column within each segment, listed largest first.
monetary_by_segment = df.groupby('customer_segment')['monetary'].sum()
monetary_by_segment.sort_values(ascending=False)


customer_segment
Champions              7706439.98
Loyal Customers        5690407.65
Potential Loyalists    1512704.40
At Risk                 512909.74
Name: monetary, dtype: float64

In [10]:
# Each segment's share of the overall monetary total, expressed in percent.
segment_revenue = df.groupby('customer_segment')['monetary'].sum()
segment_revenue.div(segment_revenue.sum()).mul(100)


customer_segment
At Risk                 3.325732
Champions              49.968936
Loyal Customers        36.896883
Potential Loyalists     9.808450
Name: monetary, dtype: float64