In [1]:
!pip install duckdb
import pandas as pd
import duckdb

customers = pd.read_csv('customers.csv')
products = pd.read_csv('products.csv')
stores = pd.read_csv('store.csv')
transactions = pd.read_csv('transactions.csv')

# Quick look at each
print(customers.shape, products.shape, stores.shape, transactions.shape)

(200, 7) (50, 6) (5, 4) (5000, 8)


In [2]:
customers.head()

Unnamed: 0,CustomerID,FirstName,LastName,Gender,BirthDate,City,JoinDate
0,C001,Michael,Davis,M,1996-09-11,Osborneport,2022-09-25
1,C002,Michael,Miller,M,1959-08-18,New Gabrielleport,2020-11-03
2,C003,Carol,Hays,F,2005-04-19,Port Allen,2024-02-12
3,C004,Joseph,Ward,M,1992-06-16,East Edgarborough,2024-09-09
4,C005,Jamie,Salinas,M,1992-06-18,Port Kimberly,2022-02-24


In [3]:
products.head()

Unnamed: 0,ProductID,ProductName,Category,SubCategory,UnitPrice,CostPrice
0,P001,Like Camera,Electronics,Camera,1673.69,1323.38
1,P002,Audience Television,Electronics,Television,818.76,527.62
2,P003,Here Footwear,Fashion,Footwear,337.63,169.03
3,P004,Four Accessories,Fashion,Accessories,1853.77,1363.52
4,P005,Knowledge Bags,Fashion,Bags,1396.22,1004.09


In [4]:
stores.head()

Unnamed: 0,StoreID,StoreName,City,Region
0,S001,MegaMart Jimenezborough,Jimenezborough,South
1,S002,MegaMart Peckmouth,Peckmouth,East
2,S003,MegaMart New Michele,New Michele,West
3,S004,MegaMart Brianahaven,Brianahaven,North
4,S005,MegaMart Johnmouth,Johnmouth,East


In [5]:
transactions.head()

Unnamed: 0,TransactionID,Date,CustomerID,ProductID,StoreID,Quantity,Discount,PaymentMethod
0,T00001,2024-06-18,C160,P014,S003,1,0.1,Bank Transfer
1,T00002,2023-11-02,C171,P030,S004,3,0.15,Bank Transfer
2,T00003,2024-03-28,C142,P002,S002,2,0.15,Mobile Money
3,T00004,2024-06-15,C174,P050,S002,5,0.1,Mobile Money
4,T00005,2024-08-29,C141,P036,S001,3,0.1,Credit Card


In [6]:
# Connecting to DuckDB in-memory
con = duckdb.connect()

In [15]:
# Registering DataFrames as DuckDB tables
con.register('customers', customers)
con.register('products', products)
con.register('stores', stores)
con.register('transactions', transactions)

<_duckdb.DuckDBPyConnection at 0x157ccc186b0>

In [18]:
# check
print(con.execute("DESCRIBE transactions").df())
print(con.execute("DESCRIBE customers").df())

     column_name column_type null   key default extra
0  TransactionID     VARCHAR  YES  None    None  None
1           Date     VARCHAR  YES  None    None  None
2     CustomerID     VARCHAR  YES  None    None  None
3      ProductID     VARCHAR  YES  None    None  None
4        StoreID     VARCHAR  YES  None    None  None
5       Quantity      BIGINT  YES  None    None  None
6       Discount      DOUBLE  YES  None    None  None
7  PaymentMethod     VARCHAR  YES  None    None  None
  column_name column_type null   key default extra
0  CustomerID     VARCHAR  YES  None    None  None
1   FirstName     VARCHAR  YES  None    None  None
2    LastName     VARCHAR  YES  None    None  None
3      Gender     VARCHAR  YES  None    None  None
4   BirthDate     VARCHAR  YES  None    None  None
5        City     VARCHAR  YES  None    None  None
6    JoinDate     VARCHAR  YES  None    None  None


In [8]:
# Customers who joined in 2024

duckdb.query("""
SELECT *
FROM customers
WHERE EXTRACT(YEAR FROM CAST(JoinDate AS DATE)) = 2024
LIMIT 5
""").df()

Unnamed: 0,CustomerID,FirstName,LastName,Gender,BirthDate,City,JoinDate
0,C003,Carol,Hays,F,2005-04-19,Port Allen,2024-02-12
1,C004,Joseph,Ward,M,1992-06-16,East Edgarborough,2024-09-09
2,C017,Vanessa,Anderson,F,1982-03-11,Peterside,2024-09-25
3,C018,Breanna,Davis,M,1994-08-19,East William,2024-11-12
4,C020,Ryan,Cole,F,1982-06-26,Travisstad,2024-04-19


In [9]:
# Total Number of Male and Female Customers

duckdb.query("""
select
case
when gender = 'M' then 'MALE'
when gender = 'F' then 'FEMALE'
end as Gender,
count(*) as Count
from customers
group by 1
order by Count DESC
""").df()

Unnamed: 0,Gender,Count
0,MALE,113
1,FEMALE,87


In [17]:
# Customers who have spent more than $10,000 in total

duckdb.query("""
SELECT 
    c.CustomerID,
    c.FirstName || ' ' || c.LastName AS FullName,
    SUM((p.UnitPrice * t.Quantity) - t.Discount) AS TotalSpent
FROM customers c
JOIN transactions t ON c.CustomerID = t.CustomerID
JOIN products p ON t.ProductID = p.ProductID
GROUP BY c.CustomerID, FullName
HAVING TotalSpent > 10000
ORDER BY TotalSpent DESC
""").df()

Unnamed: 0,CustomerID,FullName,TotalSpent
0,C012,Dale Perry,127918.10
1,C110,Travis Peters,122452.34
2,C085,Juan Ramirez,121536.97
3,C186,Vicki Guzman,110839.19
4,C168,Richard Jones,110602.99
...,...,...,...
195,C160,Meagan Macdonald,47580.23
196,C114,Miranda Rodriguez,45023.19
197,C011,Haley Williams,43879.12
198,C015,Matthew Molina,41851.15


In [21]:
# Find customers who joined in the last 90 days and already made at least 1 purchase

duckdb.query("""
Select c.FirstName || ' ' || c.LastName AS FullName,
c.JoinDate, count(TransactionID) as "Total Transactions"
from customers c
join transactions t
on c.CustomerID = t.CustomerID
where CAST(c.JoinDate as DATE) > CURRENT_DATE() - INTERVAL 90 DAY
group by FullName,JoinDate
""").df()

Unnamed: 0,FullName,JoinDate,Total Transactions
0,Ashley Gibson,2025-08-26,23
1,Jamie Webb,2025-07-31,32
2,Catherine Rice,2025-07-23,23
3,Dennis Wallace,2025-08-19,26
4,Christina Dominguez,2025-07-28,31
5,Lisa Ruiz,2025-08-22,16
6,Erin Rogers,2025-08-20,30
7,Miranda Rodriguez,2025-08-23,15
8,Amber Graham,2025-08-20,26
9,Scott Howell,2025-08-28,21


In [39]:
# Customers haven’t made any transactions in the last 3 months

duckdb.query("""
Select c.FirstName || ' ' || c.LastName AS FullName,
c.JoinDate, count(t.transactionID) AS NumofTransactions
from customers c
left join transactions t
on c.CustomerID = t.CustomerID
where CAST(c.JoinDate as DATE) < CURRENT_DATE() - INTERVAL 180 DAY
group by FullName,c.JoinDate
having count(t.transactionID) is NULL
order by c.JoinDate DESC
""").df()

Unnamed: 0,FullName,JoinDate,NumofTransactions


In [40]:
# City having the most customers

duckdb.query("""
select count(customerID) as totalcustomers, City
from customers
group by city
order by totalcustomers desc
""").df()

Unnamed: 0,totalcustomers,City
0,1,Leburgh
1,1,Juliefurt
2,1,Bradyshire
3,1,Shepherdburgh
4,1,New David
...,...,...
195,1,Port Jacob
196,1,West Amymouth
197,1,Port Bryce
198,1,East Rodney


In [41]:
# Customers visits by city

duckdb.query("""
select s.city, count(c.customerID) as customersvisit
from stores s
join transactions as t
on t.storeId = s.storeid
join customers as c
on c.customerID = t.customerID
group by s.city
order by customersvisit desc
""").df()

Unnamed: 0,City,customersvisit
0,Jimenezborough,1014
1,New Michele,1013
2,Brianahaven,1011
3,Peckmouth,1009
4,Johnmouth,953


In [48]:
# Customer Demographics : Average age of customers buying each category

duckdb.query("""
select p.category,
avg(DATE_DIFF('year',CAST(c.BirthDate AS DATE),CURRENT_DATE)) as AverageAge
from transactions t
join products p on t.productid = p.productid
join customers c on t.customerid = c.customerid
group by p.category
order by AverageAge DESC
""").df()

Unnamed: 0,Category,AverageAge
0,Groceries,45.220624
1,Fashion,45.161099
2,Electronics,43.988895
