In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/online-shop-2024/products.csv
/kaggle/input/online-shop-2024/suppliers.csv
/kaggle/input/online-shop-2024/shipments.csv
/kaggle/input/online-shop-2024/order_items.csv
/kaggle/input/online-shop-2024/reviews.csv
/kaggle/input/online-shop-2024/payment.csv
/kaggle/input/online-shop-2024/orders.csv
/kaggle/input/online-shop-2024/customers.csv


As stated in the <a href='https://www.kaggle.com/datasets/marthadimgba/online-shop-2024'>dataset page</a> "The structure of the online_shop dataset consists of interconnected tables that simulate a real-world e-commerce platform. Each table represents a key aspect of the business, such as products, orders, customers, suppliers, and reviews."</br>
In this analysis we'll use this dataset to study sales performances and to build a basic product recommender.

In [2]:
import sqlite3

<h2>Creating the database</h2>

Being the different tables related to each other, we load the csv file in a relational database:

In [3]:
products_df = pd.read_csv('/kaggle/input/online-shop-2024/products.csv')
suppliers_df = pd.read_csv('/kaggle/input/online-shop-2024/suppliers.csv')
shipments_df = pd.read_csv('/kaggle/input/online-shop-2024/shipments.csv', parse_dates = ['shipment_date', 'delivery_date'])
order_items_df = pd.read_csv('/kaggle/input/online-shop-2024/order_items.csv')
payment_df = pd.read_csv('/kaggle/input/online-shop-2024/payment.csv')#, parse_dates = ['payment_date'])
reviews_df = pd.read_csv('/kaggle/input/online-shop-2024/reviews.csv', parse_dates = ['review_date'])
orders_df = pd.read_csv('/kaggle/input/online-shop-2024/orders.csv', parse_dates = ['order_date'])
customers_df = pd.read_csv('/kaggle/input/online-shop-2024/customers.csv')

In [4]:
conn = sqlite3.connect('online_shop.db')
products_df.to_sql('products', con = conn, if_exists = 'replace', index = False)
suppliers_df.to_sql('suppliers', con = conn, if_exists = 'replace', index = False)
shipments_df.to_sql('shipments', con = conn, if_exists = 'replace', index = False)
order_items_df.to_sql('order_items', con = conn, if_exists = 'replace', index = False)
payment_df.to_sql('payments', con = conn, if_exists = 'replace', index = False)
reviews_df.to_sql('reviews', con = conn, if_exists = 'replace', index = False)
orders_df.to_sql('orders', con = conn, if_exists = 'replace', index = False)
customers_df.to_sql('customers', con = conn, if_exists = 'replace', index = False)

TABLES = ['products', 'suppliers', 'shipments', 'order_items', 'payments', 'reviews', 'orders', 'customers']

We also define two functions for speeding up querying/formatting:

In [5]:
def print_table(cnames, rows, max_columns = 7, max_rows = 20):
    """
    Print the result of a (SELECT) query in a table format with at most max_columns columns and max_rows rows
    """
    _cnames = cnames[:]
    _rows = rows[:]
    if len(cnames) > max_columns:
        _cnames = _cnames[:max_columns] + ['...']
        _rows = [row[:max_columns] + ['...'] for row in rows]
    print('|'.join(f'{cname:^15}' for cname in _cnames) + '\n')
    print('-' * 15 * len(_cnames))
    for row in _rows[:max_rows]:
        print('|'.join(f'{item:^15}' for item in row) + '\n')

def do_query(sql, connection = conn):
    """
    Use a cursor to execute a SQL query and print the results with the print_table function
    """
    cur = conn.cursor()
    cur.execute(sql)
    rows = cur.fetchall()
    cnames = list(map(lambda x : x[0], cur.description))
    print_table(cnames, rows)
    cur.close()

<h3>View tables</h3>

We start by simply looking at the first few rows of each table:

In [6]:
for table in TABLES:
    sql_view = f"""
    SELECT * FROM {table} LIMIT 5;
    """
    print(table.upper() + '\n')
    cur = conn.cursor()
    cur.execute(sql_view)
    rows = cur.fetchall()
    cnames = list(map(lambda x : x[0], cur.description))
    print_table(cnames, rows)
    print('\n')

PRODUCTS

  product_id   | product_name  |   category    |     price     |  supplier_id  

---------------------------------------------------------------------------
       1       | Office Chair  |   Furniture   |    445.01     |      501      

       2       | Coffee Maker  |Home & Kitchen |    937.29     |      502      

       3       |Document Scanner|  Electronics  |    940.02     |      503      

       4       |   Desk Mat    |  Accessories  |     76.11     |      504      

       5       | Tablet Stand  |  Accessories  |    388.17     |      505      



SUPPLIERS

  supplier_id  | supplier_name | contact_name  |    address    | phone_number  |     email     

------------------------------------------------------------------------------------------
      501      |Dynamic Systems Group|Donald Benjamin|141 Shore Ln, Island City, MA|(555) 484-6922 |dbenjamin@supplier.com

      502      |Dynamic Systems Group|Nicholas Dennis|161 Harbor Ln, Bay Point, FM|(555) 397-6986 |nde

<h2>Sales Analysis</h2>

<h3>Top 10 most purchased products</h3>

We're interested to know which products are the most popular, the products names are in the products table while the quantity purchased in different orders is stated in the order_items table:

In [7]:
sql_top_10 = """
SELECT p.product_name AS Product, 
p.category AS Category,
SUM(oi.quantity) AS Quantity
FROM products AS p 
JOIN order_items AS oi ON p.product_id = oi.product_id
GROUP BY p.product_name ORDER BY Quantity DESC LIMIT 10
"""

cur = conn.cursor()
cur.execute(sql_top_10)
rows = cur.fetchall()
cnames = list(map(lambda x : x[0], cur.description))
print_table(cnames, rows)
cur.close()

    Product    |   Category    |   Quantity    

---------------------------------------------
  4K Monitor   |  Electronics  |     2280      

 Throw Pillows |Home & Kitchen |     2072      

 Air Purifier  |Home & Kitchen |     2031      

 Standing Desk |   Furniture   |     2030      

Kitchen Blender|Home & Kitchen |     2024      

Bluetooth Headphones|  Electronics  |     2021      

 Storage Shelf |   Furniture   |     2007      

 External SSD  |  Electronics  |     2003      

  Microphone   |  Electronics  |     2001      

 Monitor Stand |   Furniture   |     1972      



From this results we note how electionics products are very popular, followed by varied home items and "electonics-related" forniture (such as standing desk and monitor stands).

<h3>Monthly gross sales</h3>

We check the gross sales values for each month:

In [8]:
sql_monthly = """
SELECT SUBSTR(orders.order_date, 1, 4) AS Year,
SUBSTR(orders.order_date, 6, 2) AS Month,
ROUND(SUM(payments.amount), 2) as 'Gross Sales'
FROM orders JOIN payments ON orders.order_id = payments.order_id
GROUP BY Year, Month
ORDER BY Year, Month ASC
"""

do_query(sql_monthly)

     Year      |     Month     |  Gross Sales  

---------------------------------------------
     2023      |      11       |   346991.19   

     2023      |      12       |   441734.7    

     2024      |      01       |   459458.14   

     2024      |      02       |   408462.62   

     2024      |      03       |   475475.15   

     2024      |      04       |   442748.32   

     2024      |      05       |   445395.85   

     2024      |      06       |   432774.14   

     2024      |      07       |   492759.03   

     2024      |      08       |   471184.74   

     2024      |      09       |   447226.92   

     2024      |      10       |   469681.23   

     2024      |      11       |    50757.7    



The sudden drop from November 2024 is simply due to partial records, the months of highest sales were July and March 2024.

<h3>Top/Worst 10 product by review</h3>

We compute the average rating for each product (the rating score goes from 1 to 5 where 1 is worst, 5 is best), we then show the 10 best rated products:

In [9]:
sql_top_rating = """
WITH reviews_avg AS
(SELECT
product_id, 
ROUND(AVG(rating), 2) as average_rating
FROM reviews
GROUP BY product_id)
SELECT products.product_name AS Product,
reviews_avg.average_rating AS Avg_Rating
FROM products JOIN reviews_avg ON products.product_id = reviews_avg.product_id
GROUP BY products.product_name
ORDER BY Avg_Rating DESC
LIMIT 10
"""

do_query(sql_top_rating)

    Product    |  Avg_Rating   

------------------------------
 Throw Pillows |      5.0      

 Storage Shelf |      5.0      

  Smart Watch  |      5.0      

Screen Protector|      5.0      

Portable Charger|      5.0      

  Phone Grip   |      5.0      

 Laptop Sleeve |      5.0      

Food Processor |      5.0      

Electric Kettle|      5.0      

Wireless Charger|      4.0      



To find the worst rated product the quering logic is the name; it's worth noting how "bluethooth headphones" is in the list while also being one of the most purchased product:

In [10]:
sql_worst_rating = """
WITH reviews_avg AS
(SELECT
product_id, 
ROUND(AVG(rating), 2) as average_rating
FROM reviews
GROUP BY product_id)
SELECT products.product_name AS Product,
reviews_avg.average_rating AS Avg_Rating
FROM products JOIN reviews_avg ON products.product_id = reviews_avg.product_id
GROUP BY products.product_name
ORDER BY Avg_Rating ASC
LIMIT 10
"""

do_query(sql_worst_rating)

    Product    |  Avg_Rating   

------------------------------
Bluetooth Headphones|      1.0      

   Bookshelf   |      1.0      

Cable Organizer|      1.0      

 Coffee Maker  |      1.0      

   Desk Mat    |      1.0      

Document Scanner|      1.0      

  Drawer Unit  |      1.0      

Noise Cancelling Headset|      1.0      

Storage Containers|      1.0      

 Tablet Stand  |      1.0      



<h2>Basic Recommender</h2>

Finally we use the sales information to build a simple products recommender: when a customer buys a product we'll suggest them the most purchased products by the other customer who also bought the product.</br>
To make a more refined recommender, we could suggest popular products that have been purchased in the same order, but as we see below each order in this database contains only one product ID:

In [11]:
sql_order_size = """
SELECT
order_id, 
COUNT(DISTINCT product_id) AS Order_Size 
FROM order_items
GROUP BY order_id
ORDER BY Order_Size DESC
LIMIT 5
"""
do_query(sql_order_size)

   order_id    |  Order_Size   

------------------------------
     15000     |       1       

     14999     |       1       

     14998     |       1       

     14997     |       1       

     14996     |       1       



For convenience we first create a temporary view with the customer ID and the product name for each order, which are the only data we need:

In [12]:
sql_view = """
DROP VIEW IF EXISTS customers_products;
CREATE TEMP VIEW IF NOT EXISTS customers_products AS
SELECT products.product_name,
customers.customer_id
FROM customers JOIN orders ON customers.customer_id = orders.customer_id
JOIN order_items on orders.order_id = order_items.order_id
JOIN products ON order_items.product_id = products.product_id;
"""

cur = conn.cursor()
cur.executescript(sql_view)
cur.close()

In [13]:
sql_select_view = """
SELECT * 
FROM customers_products
LIMIT 20
"""

do_query(sql_select_view)

 product_name  |  customer_id  

------------------------------
  Smart Watch  |       1       

  Smart Watch  |       1       

  Microphone   |       1       

  Microphone   |       1       

Bluetooth Headphones|       2       

Storage Containers|       3       

Storage Containers|       3       

Cable Organizer|       4       

Cable Organizer|       4       

Screen Protector|       5       

   Bookshelf   |       5       

  Smart Watch  |       6       

  Smart Watch  |       6       

   USB-C Hub   |       6       

 Throw Pillows |       7       

 Office Chair  |       7       

 Office Chair  |       7       

 Monitor Stand |       8       

Screen Protector|       9       

Screen Protector|      10       



For the recommender we then write a simple function that takes as input a product name and:
<ul>
    <li>Find the customer ID of all the customers who purchased that product</li>
    <li>Select all the product names from these customers (excluded the input) and how many time these name appears</li>
    <li>Order the product names by appearances and select the top 3 most popular products.</li>
</ul>

In [14]:
def suggest_product(product_name):
    sql_find_view = """
SELECT product_name, COUNT(product_name) AS product_count
FROM customers_products
WHERE (customer_id IN (SELECT customer_id FROM customers_products WHERE lower(product_name) = ?)) AND (lower(product_name) != ?)
GROUP BY product_name
ORDER BY product_count DESC
LIMIT 3
"""
    cur = conn.cursor()
    cur.execute(sql_find_view, (product_name.lower(), product_name.lower()))
    suggested_products = [row[0] for row in cur.fetchall()]
    print(f'People who purchased {product_name} also bought:', *suggested_products, sep='\n')
    cur.close()

suggest_product('smart watch')

People who purchased smart watch also bought:
4K Monitor
Wireless Mouse
Desk Mat
