In [1]:
import json, sqlparse
import pandas as pd

In [2]:
sql_extract_token_type = {
            sqlparse.sql.IdentifierList, sqlparse.sql.Where,
            sqlparse.sql.Having, sqlparse.sql.Comparison, sqlparse.sql.Function,
            sqlparse.sql.Parenthesis, sqlparse.sql.Operation, sqlparse.sql.Case
        }

def columns_from_query(sql_query):
    # identifiers contain table name and column name
    if type(sql_query) == str:
        sql_query = sqlparse.parse(sql_query)[0]
    columns = []
    for token in sql_query:
        if isinstance(token, sqlparse.sql.Identifier):
            columns.append(token.get_real_name().lower())
        elif hasattr(token, "tokens"):
            columns.extend(columns_from_query(token.tokens))
    return columns

def columns_by_split(sql_query:str, all_columns:list):
    columns = []
    for token in sql_query.split():
        if token[-1] == ",": token = token[:-1]
        if token in all_columns:
            columns.append(token)
    return columns

In [3]:
with open("../filtering-schema/src/schemas/column-datatypes/pointx_fbs_rpt_dly_datatype.json") as f:
    all_columns = set(json.load(f)['COLUMNS'].keys())
exp_df = pd.read_excel("../src/pointx/PointX - text2sql pair.xlsx")
exp_df.head()

Unnamed: 0,Group,Question,Description/Calculation,SQL,Remark
0,Active Users,How many daily active users each day?,Active means engagement_time_msec (by default ...,"SELECT event_date, COUNT(DISTINCT user_pseudo_...",
1,,How many monthly active users each month?,,"SELECT event_month, COUNT(DISTINCT user_pseudo...",
2,,What is the average number of daily active use...,,"SELECT AVG(cnt) FROM\n (SELECT event_date, CO...",
3,,What is the mean number of daily active users ...,,"SELECT AVG(cnt) FROM\n (SELECT event_date, CO...",
4,,When was the last time each user was active?,,"SELECT user_pseudo_id, MAX(last_active) FROM\n...",


In [6]:
used_cols = []

for i, row in exp_df.iterrows():
    try:
        used_cols.extend([c for c in columns_from_query(row['SQL']) if c in all_columns])
        used_cols.extend(columns_by_split(row['SQL'], all_columns))
    except: pass

used_cols = list(set(used_cols))
sorted(used_cols)

['device_category',
 'device_mobile_model_name',
 'engagement_time_msec',
 'event_date',
 'event_month',
 'event_name',
 'event_timestamp',
 'ga_session_id',
 'geo_country',
 'geo_region',
 'user_first_touch_timestamp',
 'user_pseudo_id']

In [8]:
pointx_rpt_dly_df = pd.read_csv("../filtering-schema/src/data/pointx_fbs_rpt_dly.csv")[used_cols]
pointx_rpt_dly_df

Unnamed: 0,user_first_touch_timestamp,event_name,ga_session_id,event_date,event_timestamp,user_pseudo_id,engagement_time_msec,geo_region,event_month,device_mobile_model_name,geo_country,device_category
0,2022-02-23T11:52:03.700+0000,pointx_payandmerchant_bottom_bar,1.653651e+09,2022-05-27,2022-05-27T18:33:43.629+0000,F51A60AC085F4416B246F636099DAB85,0,Bangkok,2022-05,iPhone 8 Plus,Thailand,mobile
1,2022-02-23T11:52:03.700+0000,mypointx_landing,1.653651e+09,2022-05-27,2022-05-27T18:34:07.435+0000,F51A60AC085F4416B246F636099DAB85,0,Bangkok,2022-05,iPhone 8 Plus,Thailand,mobile
2,2022-02-23T11:52:03.700+0000,pointx_home_bottom_bar,1.653451e+09,2022-05-27,2022-05-27T13:11:49.621+0000,F51A60AC085F4416B246F636099DAB85,0,Bangkok,2022-05,iPhone 8 Plus,Thailand,mobile
3,2022-02-23T11:52:03.700+0000,screen_view,1.653451e+09,2022-05-27,2022-05-27T13:11:59.516+0000,F51A60AC085F4416B246F636099DAB85,0,Bangkok,2022-05,iPhone 8 Plus,Thailand,mobile
4,2022-02-23T11:52:03.700+0000,mypointx_landing,1.653451e+09,2022-05-27,2022-05-27T13:12:00.358+0000,F51A60AC085F4416B246F636099DAB85,0,Bangkok,2022-05,iPhone 8 Plus,Thailand,mobile
...,...,...,...,...,...,...,...,...,...,...,...,...
995,2022-01-21T19:30:19.329+0000,session_start,1.653901e+09,2022-05-30,2022-05-30T16:01:08.481+0000,5AF61BAC90A94DF2A25B432D527FD88E,0,Bangkok,2022-05,iPhone 13 Pro Max,Thailand,mobile
996,2022-01-21T19:30:19.329+0000,pointx_more,1.653901e+09,2022-05-30,2022-05-30T16:01:12.972+0000,5AF61BAC90A94DF2A25B432D527FD88E,0,Bangkok,2022-05,iPhone 13 Pro Max,Thailand,mobile
997,2022-01-21T19:30:19.329+0000,screen_view,1.653901e+09,2022-05-30,2022-05-30T16:01:52.408+0000,5AF61BAC90A94DF2A25B432D527FD88E,0,Bangkok,2022-05,iPhone 13 Pro Max,Thailand,mobile
998,2022-01-21T19:30:19.329+0000,delivery_addr_item,1.653901e+09,2022-05-30,2022-05-30T16:02:04.709+0000,5AF61BAC90A94DF2A25B432D527FD88E,0,Bangkok,2022-05,iPhone 13 Pro Max,Thailand,mobile
