In [2]:
from pandasql import sqldf
def pysqldf(q):
    """Run the SQL text ``q`` through ``sqldf`` using this notebook's globals.

    The global namespace is handed over so that a query can refer to
    DataFrames (for example ``orders``) by their variable names.
    Returns whatever ``sqldf`` returns for the query.
    """
    return sqldf(q, globals())

In [None]:
# q = """
# SELECT strftime('%Y', date) AS year, SUM(beef) AS beef_total
# FROM meat
# GROUP BY year;
# """
# print(pysqldf(q).head())

In [4]:
import pandas as pd

### sample data for the practice (this is a fake dataset)

- dt: transaction date, could be string or datetime, flexible with hive sql
- oid: string, order id, primary key
- uid: customer id
- num_items: number of items purchased by customers per order
- tt_gmv: total price of each order (not per item!)

In [15]:
# Read every column of the sample file as text, then keep only the first 19 rows.
orders = (
    pd.read_csv('sample_transaction.csv', dtype=str)
    .head(19)
)


In [22]:
# The file was loaded as text, so cast the two numeric columns in a single pass:
# tt_gmv becomes float and num_items becomes int. All other columns stay as they are.
orders = orders.astype({'tt_gmv': float, 'num_items': int})

In [23]:
# Sanity check: read the `orders` DataFrame back through the SQL helper and
# print only the first rows of the result (.head()).
q = """
SELECT *
FROM orders
"""
print(pysqldf(q).head()) 

           dt oid   uid  num_items  tt_gmv
0  2023-01-01   1  1000          1    20.0
1  2023-01-02   2  1001          2    56.0
2  2023-02-05   3  1002          3    34.0
3  2023-06-10   4  1003          4   100.0
4  2023-07-01   5  1000          2    23.0


### rank function
Ranking functions are among the most frequently used window functions, and row_number is the most popular of them
- row_number() over(partition by xxx order by xxx)
    - 1,2,3,4
- rank() over(partition by xxx order by xxx)
    - 1,2,2,4
- dense_rank() over(partition by xxx order by xxx)
    - 1,2,2,3

Q1: for every customer (uid), get the most expensive order they have purchased; dt, oid and num_items need to be returned as well
- highest order: max tt_gmv

In [28]:
# Q1: inside each uid partition, number the orders from highest to lowest
# tt_gmv and keep only the row numbered 1 (one top order per customer).
# The outer query then lists those top orders by tt_gmv, highest first.
# row_number() never repeats a number within a partition, so if a customer has
# two orders tied for the highest tt_gmv, only one of them is returned.
q = """
select *
from (SELECT *, row_number() over(partition by uid order by tt_gmv desc) as rank
FROM orders)t
where rank = 1
order by tt_gmv desc
"""
print(pysqldf(q)) 

           dt oid   uid  num_items  tt_gmv  rank
0  2023-07-15   6  1001          3   100.0     1
1  2023-06-10   4  1003          4   100.0     1
2  2023-11-01  16  1000          1    78.0     1
3  2023-07-16   7  1002          1    55.0     1


### percentile function
Percentile functions are widely used to select top-performing targets (countries, supermarkets, customers, etc.)
- ntile()
- percent_rank()

Q: return the top 25% of orders by tt_gmv

In [31]:
# Step 1 (ntile): split all orders into 4 buckets by descending tt_gmv and
# print every row with its bucket number in the `percentile` column.
# Bucket 1 holds the highest-value orders. No filter is applied yet.
q = """
select *
from (SELECT *, ntile(4) over(order by tt_gmv desc) as percentile
FROM orders)t


"""
print(pysqldf(q))

            dt oid   uid  num_items  tt_gmv  percentile
0   2023-06-10   4  1003          4   100.0           1
1   2023-07-15   6  1001          3   100.0           1
2   2023-12-18  19  1003          5    99.0           1
3   2023-11-01  16  1000          1    78.0           1
4   2023-09-28  13  1001          5    66.0           1
5   2023-01-02   2  1001          2    56.0           2
6   2023-07-16   7  1002          1    55.0           2
7   2023-09-10  11  1002          4    45.0           2
8   2023-10-16  15  1002          3    45.0           2
9   2023-12-05  18  1000          2    45.0           2
10  2023-02-05   3  1002          3    34.0           3
11  2023-07-01   5  1000          2    23.0           3
12  2023-09-01  10  1001          3    23.0           3
13  2023-10-01  14  1002          2    23.0           3
14  2023-11-05  17  1000          1    23.0           3
15  2023-01-01   1  1000          1    20.0           4
16  2023-08-15   9  1000          2    18.0     

In [34]:
# Step 2 (ntile): same bucketing as the previous query, but keep only bucket 1,
# i.e. the top quarter of orders by tt_gmv.
q = """
select *
from (SELECT *, ntile(4) over(order by tt_gmv desc) as percentile
FROM orders)t
where t.percentile = 1


"""
print(pysqldf(q))

           dt oid   uid  num_items  tt_gmv  percentile
0  2023-06-10   4  1003          4   100.0           1
1  2023-07-15   6  1001          3   100.0           1
2  2023-12-18  19  1003          5    99.0           1
3  2023-11-01  16  1000          1    78.0           1
4  2023-09-28  13  1001          5    66.0           1


In [33]:
# Step 1 (percent_rank): attach percent_rank() over descending tt_gmv to every
# order and print all rows. The value is (rank - 1) / (row count - 1), so the
# highest tt_gmv gets 0.0 and rows with equal tt_gmv share the same value.
q = """
select *
from (SELECT *, percent_rank() over(order by tt_gmv desc) as percentile
FROM orders)t


"""
print(pysqldf(q))

            dt oid   uid  num_items  tt_gmv  percentile
0   2023-06-10   4  1003          4   100.0    0.000000
1   2023-07-15   6  1001          3   100.0    0.000000
2   2023-12-18  19  1003          5    99.0    0.111111
3   2023-11-01  16  1000          1    78.0    0.166667
4   2023-09-28  13  1001          5    66.0    0.222222
5   2023-01-02   2  1001          2    56.0    0.277778
6   2023-07-16   7  1002          1    55.0    0.333333
7   2023-09-10  11  1002          4    45.0    0.388889
8   2023-10-16  15  1002          3    45.0    0.388889
9   2023-12-05  18  1000          2    45.0    0.388889
10  2023-02-05   3  1002          3    34.0    0.555556
11  2023-07-01   5  1000          2    23.0    0.611111
12  2023-09-01  10  1001          3    23.0    0.611111
13  2023-10-01  14  1002          2    23.0    0.611111
14  2023-11-05  17  1000          1    23.0    0.611111
15  2023-01-01   1  1000          1    20.0    0.833333
16  2023-08-15   9  1000          2    18.0    0

In [35]:
# Step 2 (percent_rank): same ranking as the previous query, but keep only the
# orders whose percent_rank is at most 0.25 (the top 25% by tt_gmv).
q = """
select *
from (SELECT *, percent_rank() over(order by tt_gmv desc) as percentile
FROM orders)t
where t.percentile <=0.25

"""
print(pysqldf(q))

           dt oid   uid  num_items  tt_gmv  percentile
0  2023-06-10   4  1003          4   100.0    0.000000
1  2023-07-15   6  1001          3   100.0    0.000000
2  2023-12-18  19  1003          5    99.0    0.111111
3  2023-11-01  16  1000          1    78.0    0.166667
4  2023-09-28  13  1001          5    66.0    0.222222


## lead lag function

- lead()
- lag()

Q2: use another data set for login detection; the goal is to get the user's most recent normal login frequency
- definition of normal: consecutive is_abnormal = 0

In [5]:
# Load the login-detection sample with every column kept as text, then show it.
login = pd.read_csv(
    'login_detection.csv',
    dtype=str,
)
login

Unnamed: 0,timestamp,uid,is_abnormal
0,2023-11-01,123,0
1,2023-11-02,123,0
2,2023-11-03,123,1
3,2023-11-04,123,1
4,2023-11-05,123,1
5,2023-11-06,123,1
6,2023-11-07,123,1
7,2023-11-08,123,0
8,2023-11-09,123,1
9,2023-11-10,123,0


In [23]:
# For each uid, order the logins from newest to oldest and compute:
#   - rank: row_number() in that order, so rank = 1 is the latest login;
#   - lead_normal: is_abnormal of the next row in that order, i.e. the login
#     that happened just before the current one.
# Only the rank = 1 row is kept, so the result is one row per uid holding the
# latest login's is_abnormal next to the previous login's is_abnormal.
# NOTE(review): this compares only the latest two logins; it does not yet count
# a run of consecutive is_abnormal = 0 rows - confirm whether more steps follow.
q = """
select timestamp,uid,is_abnormal,lead_normal,rank
from (SELECT *, 
lead(is_abnormal,1)over(partition by uid order by timestamp desc) as lead_normal,
row_number()over(partition by uid order by timestamp desc) as rank
FROM login) as a
where rank = 1

"""
print(pysqldf(q)) 

    timestamp  uid is_abnormal lead_normal  rank
0  2023-11-11  123           0           0     1


## sum over