In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sqlite3
print('SQLite version: ', sqlite3.sqlite_version)
import warnings; 
warnings.simplefilter('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

SQLite version:  3.37.2
/kaggle/input/ecommerce-data/Sales target.csv
/kaggle/input/ecommerce-data/Order Details.csv
/kaggle/input/ecommerce-data/List of Orders.csv


The dataset is retrieved from <a href="https://www.kaggle.com/datasets/benroshan/ecommerce-data">this page</a>.</br>
**Context**
<ul>
<li>List of Orders: This dataset contains purchase information. The information includes ID, Date of Purchase and customer details.</li>
<li>Order Details: This dataset contains order ID, with the order price, quantity,profit, category and subcategory of product</li>
<li>Sales target: This dataset contains sales target amount and date for each product category</li>
</ul>

In [2]:
order_list_df = pd.read_csv('/kaggle/input/ecommerce-data/List of Orders.csv')
order_details_df = pd.read_csv('/kaggle/input/ecommerce-data/Order Details.csv')
sales_target_df = pd.read_csv('/kaggle/input/ecommerce-data/Sales target.csv')

In [3]:
conn = sqlite3.connect('ecommerce.db')
order_list_df.to_sql('order_list', conn, if_exists = 'replace', index = False)
order_details_df.to_sql('order_details', conn, if_exists = 'replace', index = False)
sales_target_df.to_sql('sales_target', conn, if_exists = 'replace', index = False)

36

We first show the first 5 rows of all the tables:

In [4]:
TABLES = ['order_list', 'order_details', 'sales_target']

def format_table(column_names, rows):
    print([f"{cname : ^15}" for cname in column_names])
    print('-' * 20 * len(column_names))
    for row in rows:
        print([f"{entry : ^15}" for entry in row])
    print('\n\n')

def show_tables(tables = TABLES):
    for table in tables:
        cur = conn.cursor()
        sql = f"SELECT * FROM {table} LIMIT 5"
        cur.execute(sql)
        rows = cur.fetchall()
        column_names = list(map(lambda x : x[0], cur.description))
        print(' ' * 40 + table.upper() + '\n')
        print([f"{cname : ^15}" for cname in column_names])
        print('-' * 20 * len(column_names))
        for row in rows:
            print([f"{entry : ^15}" for entry in row])
        print('\n\n')
        cur.close()

show_tables()

                                        ORDER_LIST

['   Order ID    ', '  Order Date   ', ' CustomerName  ', '     State     ', '     City      ']
----------------------------------------------------------------------------------------------------
['    B-25601    ', '  01-04-2018   ', '    Bharat     ', '    Gujarat    ', '   Ahmedabad   ']
['    B-25602    ', '  01-04-2018   ', '     Pearl     ', '  Maharashtra  ', '     Pune      ']
['    B-25603    ', '  03-04-2018   ', '     Jahan     ', 'Madhya Pradesh ', '    Bhopal     ']
['    B-25604    ', '  03-04-2018   ', '    Divsha     ', '   Rajasthan   ', '    Jaipur     ']
['    B-25605    ', '  05-04-2018   ', '    Kasheen    ', '  West Bengal  ', '    Kolkata    ']



                                        ORDER_DETAILS

['   Order ID    ', '    Amount     ', '    Profit     ', '   Quantity    ', '   Category    ', ' Sub-Category  ']
--------------------------------------------------------------------------------------------------

I prefer to format the column names with the snake case concention:

In [5]:
sql_rename = """
ALTER TABLE order_list RENAME COLUMN 'Order ID' TO order_id;
ALTER TABLE order_list RENAME COLUMN 'Order Date' TO order_date;
ALTER TABLE order_list RENAME COLUMN 'CustomerName' TO customer_name;
ALTER TABLE order_list RENAME COLUMN 'State' TO  state;
ALTER TABLE order_list RENAME COLUMN 'City' TO city;

--Rename ORDER_DETAILS columns
ALTER TABLE order_details RENAME COLUMN 'Order ID' TO order_id;
ALTER TABLE order_details RENAME COLUMN 'Amount' TO amount;
ALTER TABLE order_details RENAME COLUMN 'Profit' TO profit;
ALTER TABLE order_details RENAME COLUMN 'Quantity' TO quantity;
ALTER TABLE order_details RENAME COLUMN 'Category' TO category;
ALTER TABLE order_details RENAME COLUMN 'Sub-Category' TO sub_category;

--Rename SALES_TARGET columns
ALTER TABLE sales_target RENAME COLUMN 'Month of Order Date' TO month_of_order;
ALTER TABLE sales_target RENAME COLUMN 'Category' TO category;
ALTER TABLE sales_target RENAME COLUMN 'Target' TO target;
"""

cur = conn.cursor()
cur.executescript(sql_rename)
conn.commit()
cur.close()
show_tables()

                                        ORDER_LIST

['   order_id    ', '  order_date   ', ' customer_name ', '     state     ', '     city      ']
----------------------------------------------------------------------------------------------------
['    B-25601    ', '  01-04-2018   ', '    Bharat     ', '    Gujarat    ', '   Ahmedabad   ']
['    B-25602    ', '  01-04-2018   ', '     Pearl     ', '  Maharashtra  ', '     Pune      ']
['    B-25603    ', '  03-04-2018   ', '     Jahan     ', 'Madhya Pradesh ', '    Bhopal     ']
['    B-25604    ', '  03-04-2018   ', '    Divsha     ', '   Rajasthan   ', '    Jaipur     ']
['    B-25605    ', '  05-04-2018   ', '    Kasheen    ', '  West Bengal  ', '    Kolkata    ']



                                        ORDER_DETAILS

['   order_id    ', '    amount     ', '    profit     ', '   quantity    ', '   category    ', ' sub_category  ']
--------------------------------------------------------------------------------------------------

In [6]:
sql_combined_view = """
CREATE VIEW IF NOT EXISTS order_combined AS
SELECT d.order_id, d.amount, d.profit, d.quantity, d.category, d.sub_category, 
DATE(substr(l.order_date, -4, 4)||'-'||substr(l.order_date, 4, 2)||'-'||substr(l.order_date, 1, 2)) as order_date, 
l.customer_name, l.state, l.city
FROM order_details as d
INNER JOIN order_list as l
ON d.order_id  = l.order_id;
"""
cur = conn.cursor()
cur.executescript(sql_combined_view)
conn.commit()
cur.close()

<h2>QUERY 1: How many...?</h2>

As a first query I just want to get an idea of the size of traffic we're dealing, I'll first count the unique instances of customer, orders, states and cities:

In [7]:
sql_count = """
SELECT COUNT(DISTINCT customer_name) as num_customers,
	   COUNT(DISTINCT order_id) as num_orders,
	   COUNT(DISTINCT state) as num_states,
	   COUNT(DISTINCT city) as num_city
FROM order_combined;
"""

cur = conn.cursor()
cur.execute(sql_count)
rows = cur.fetchall()
column_names = list(map(lambda x : x[0], cur.description))
format_table(column_names, rows)
cur.close()

[' num_customers ', '  num_orders   ', '  num_states   ', '   num_city    ']
--------------------------------------------------------------------------------
['      332      ', '      500      ', '      19       ', '      24       ']





<h2>QUERY 2: Amount/Profit/Quantity over time</h2>

We aim to check the sales trend, therefore we monitor the total values of amount (proceeds), profit and quantity over time:

In [8]:
sql_ts = """
SELECT CAST(substr(order_date, 1, 4) AS INTEGER) as year,
CAST(substr(order_date, 6, 2) AS INTEGER) as month,
SUM(amount) as total_amount,
SUM(profit) as total_profit,
SUM(quantity) as total_quantity
FROM order_combined
GROUP BY year, month;
"""

cur = conn.cursor()
cur.execute(sql_ts)
rows = cur.fetchall()
column_names = list(map(lambda x : x[0], cur.description))
format_table(column_names, rows)
cur.close()

['     year      ', '     month     ', ' total_amount  ', ' total_profit  ', 'total_quantity ']
----------------------------------------------------------------------------------------------------
['     2018      ', '       4       ', '    32726.0    ', '    -3960.0    ', '      389      ']
['     2018      ', '       5       ', '    28545.0    ', '    -3584.0    ', '      423      ']
['     2018      ', '       6       ', '    23658.0    ', '    -4970.0    ', '      369      ']
['     2018      ', '       7       ', '    12966.0    ', '    -2138.0    ', '      240      ']
['     2018      ', '       8       ', '    30899.0    ', '    -2180.0    ', '      446      ']
['     2018      ', '       9       ', '    26628.0    ', '    -4963.0    ', '      331      ']
['     2018      ', '      10       ', '    31615.0    ', '    3093.0     ', '      419      ']
['     2018      ', '      11       ', '    48086.0    ', '    11619.0    ', '      578      ']
['     2018      ', '      12      

<h2>QUERY 3: Performance by state</h2>

A possible question would be to check which state are more profitable by sales, that is which states generate more revenue for each transaction:

In [9]:
sql_state = """
SELECT state, 
	   SUM(profit) as total_profit,
	   SUM(quantity) as total_quantity,
	   ROUND(CAST(SUM(profit) AS FLOAT) / SUM(quantity), 2) as profit_per_item
FROM order_combined
GROUP BY state
ORDER BY profit_per_item DESC;
"""

cur = conn.cursor()
cur.execute(sql_state)
rows = cur.fetchall()
column_names = list(map(lambda x : x[0], cur.description))
format_table(column_names, rows)
cur.close()

['     state     ', ' total_profit  ', 'total_quantity ', 'profit_per_item']
--------------------------------------------------------------------------------
['    Haryana    ', '    1325.0     ', '      111      ', '     11.94     ']
['    Kerala     ', '    1871.0     ', '      157      ', '     11.92     ']
['  West Bengal  ', '    2500.0     ', '      216      ', '     11.57     ']
[' Uttar Pradesh ', '    3237.0     ', '      288      ', '     11.24     ']
['     Delhi     ', '    2987.0     ', '      277      ', '     10.78     ']
['  Maharashtra  ', '    6176.0     ', '     1056      ', '     5.85      ']
['Himachal Pradesh', '     656.0     ', '      113      ', '     5.81      ']
['   Rajasthan   ', '    1257.0     ', '      282      ', '     4.46      ']
['    Sikkim     ', '     401.0     ', '      93       ', '     4.31      ']
['Madhya Pradesh ', '    5551.0     ', '     1360      ', '     4.08      ']
['   Karnataka   ', '     645.0     ', '      180      ', '     3.58   

<h2>QUERY 4: RFM customer segmentation</h2>

Finally, we perform a RFM segmentation on the customer.</br>
RFM analysis is a technique used by marketers to identify the most valuable customer, allowing them to tailor marketing, sales-driven strategies to boost lifetime value and loyalty.</br>
To each customer is assigned a RFM score based on:
<ul>
    <li>RECENCY: how recently a customer has made a purchase.</li>
    <li>FREQUENCY: how often a customer make purchases.</li>
    <li>Monetary: hoe much a customer spends.</li>
</ul>
Higher scores represents more engaged, valuable customers.

In [10]:
sql_rfm = """
WITH customer_data AS(
SELECT 
customer_name,
julianday('2019-03-31') - MAX(julianday(order_date)) as recency,
COUNT(DISTINCT order_id) AS frequency,
SUM(amount) as monetary
FROM order_combined
GROUP BY customer_name
)
SELECT 
customer_name,
R,F,M, 
(R+F+M)/3.0 as RFM 
FROM (
SELECT
customer_name,
NTILE(5) OVER (ORDER BY recency DESC) AS R,
NTILE(5) OVER (ORDER BY frequency ASC) AS F,
NTILE(5) OVER (ORDER BY monetary ASC) AS M
FROM customer_data)
ORDER BY RFM DESC;
"""

cur = conn.cursor()
cur.execute(sql_rfm)
#conn.commit()
rows = cur.fetchall()
column_names = list(map(lambda x : x[0], cur.description))
format_table(column_names, rows)
cur.close()

[' customer_name ', '       R       ', '       F       ', '       M       ', '      RFM      ']
----------------------------------------------------------------------------------------------------
['    Shubham    ', '       5       ', '       5       ', '       5       ', '      5.0      ']
['     Pooja     ', '       5       ', '       5       ', '       5       ', '      5.0      ']
['     Parth     ', '       5       ', '       5       ', '       5       ', '      5.0      ']
['    Paridhi    ', '       5       ', '       5       ', '       5       ', '      5.0      ']
['    Parishi    ', '       5       ', '       5       ', '       5       ', '      5.0      ']
['     Kirti     ', '       5       ', '       5       ', '       5       ', '      5.0      ']
['    Yaanvi     ', '       5       ', '       5       ', '       5       ', '      5.0      ']
['    Chirag     ', '       5       ', '       5       ', '       5       ', '      5.0      ']
['    Anurag     ', '       5      