In [228]:
import pandas as pd
import numpy as np
import sqlparse, time
from sentence_transformers import SentenceTransformer, util

# Load the column/description sheet. header=1 takes the column names from the
# sheet's second row; rows missing either field are dropped and the index is
# renumbered from 0 so row positions and index labels coincide.
description_df = (
    pd.read_excel('src/New_query_Description.xlsx', header=1)
    [['Column', 'Description']]
    .dropna()
    .reset_index(drop=True)
)

# Row position -> column name, in sheet order.
desc_col = dict(enumerate(description_df['Column']))

In [229]:
# Plain Python list of the description texts, in the same row order as description_df.
descriptions = description_df.loc[:, 'Description'].to_list()

In [230]:
# Only the header is needed here, so nrows=0 skips parsing the table's data rows.
pointx_cols = pd.read_csv('src/pointx_fbs_rpt_dly.csv', nrows=0).columns.to_list()
# Sanity check: this compares counts only, not the names themselves.
assert len(pointx_cols) == len(desc_col), "Length of columns in Pointx table and Descriptions are not equal"

In [231]:
# Lower-cased token texts that are never reported as columns: SQL function
# names / keywords, the `cnt` alias and the source table name.
_GET_COL_IGNORE = frozenset({
    'over', 'extract', 'desc', 'datediff', 'dayofweek', 'cnt', 'dateadd',
    'max', 'min', 'sum', 'count', 'getdate', 'timestampdiff', 'weekday',
    'having', 'month', 'year', 'day', 'date', 'avg',
    'team_tds.tds_intern.pointx_fbs_txn_rpt_dly',
})


def get_col(sql):
    """Collect the identifier texts referenced inside a parsed SQL token tree.

    Parameters
    ----------
    sql : sqlparse token group
        Any object exposing ``.tokens`` (e.g. the statement returned by
        ``sqlparse.parse(...)[0]``, or a nested group during recursion).

    Returns
    -------
    list of str
        Identifier texts in traversal order. Duplicates are kept, and the
        result is not checked against the table's actual column list, so
        aliases that are not in the ignore list can appear.
    """
    # Group types whose children may hold identifiers and are therefore walked.
    container_types = (
        sqlparse.sql.IdentifierList,
        sqlparse.sql.Where,
        sqlparse.sql.Having,
        sqlparse.sql.Comparison,
        sqlparse.sql.Function,
        sqlparse.sql.Parenthesis,
        sqlparse.sql.Operation,
    )

    col = []
    for token in sql.tokens:
        text = str(token)
        if text.lower() in _GET_COL_IGNORE:
            continue

        if isinstance(token, sqlparse.sql.Identifier):
            # A compound identifier (e.g. "expr AS alias", "col ASC") wraps
            # other groups; descend into it. Testing the structure instead of
            # looking for the substring "as" keeps plain names that merely
            # contain "as" (such as "last_name") from being dropped.
            is_compound = any(
                isinstance(child, sqlparse.sql.TokenList) for child in token.tokens
            )
            if is_compound:
                col += get_col(token)
            elif '"' in text:
                # Double-quoted text is treated as a condition value, not a column.
                continue
            else:
                col.append(text)
        elif isinstance(token, container_types):
            col += get_col(token)

    return col

In [232]:
# Evaluation set: each row pairs a natural-language question with its SQL text.
compare_df = pd.read_csv('src/compare_result.csv')
questions, sql_queries = (
    compare_df[field].to_list() for field in ('Question', 'SQL')
)

In [236]:
model = SentenceTransformer('all-MiniLM-L6-v2')
# Encode every description in a single batched call instead of one call per
# text; list(...) keeps the result as a list with one 1-D vector per description.
description_embs = list(model.encode(descriptions))

def get_col_max_score(question,col_labels,description_embs=description_embs,description_df=description_df):
    """Count the descriptions that score above the weakest expected column.

    The question is embedded with ``model`` and compared (cosine similarity)
    against every entry of ``description_embs``. The lowest score among the
    rows whose ``Column`` is in ``col_labels`` is used as the threshold.

    Parameters
    ----------
    question : str
        Question text passed to ``model.encode``.
    col_labels : list of str
        Expected column names; names absent from ``description_df['Column']``
        are ignored.
    description_embs : sequence of embeddings
        One embedding per row of ``description_df``, in the same order.
        NOTE: the default is bound when this cell runs, so re-run the cell
        after rebuilding the embeddings.
    description_df : pandas.DataFrame
        Frame with ``Column`` and ``Description`` columns.

    Returns
    -------
    int
        Number of descriptions whose score is strictly greater than the
        threshold.

    Raises
    ------
    ValueError
        If none of ``col_labels`` occurs in ``description_df['Column']``.
    """
    q_emb = model.encode(question)
    scores = np.array([float(util.cos_sim(q_emb, des)) for des in description_embs])

    # A boolean mask selects by row position, so the lookup stays correct even
    # when description_df does not carry a default 0..n-1 index.
    expected_mask = description_df['Column'].isin(col_labels).to_numpy()
    if not expected_mask.any():
        raise ValueError(
            f"None of the expected columns {col_labels!r} appear in description_df['Column']"
        )
    col_labels_score = scores[expected_mask]
    min_threshold = np.min(col_labels_score)

    best_row = description_df.iloc[int(np.argmax(scores))]

    print("QUESTION:\t",question)
    print("EXPECT COLUMNS:\t",col_labels)
    print("MIN THRESHOLD:\t",min_threshold)
    print("MAX THRESHOLD:\t",np.max(col_labels_score))
    print("MAX SCORE COLUMN:\t",best_row['Column'])
    print("DESCIPTION:\t",best_row['Description'])
    print("SCORE:\t",np.max(scores))

    # NOTE(review): the comparison is strict, so the expected column sitting
    # exactly at the threshold is not counted -- confirm that is intended.
    n_columns = int(np.count_nonzero(scores > min_threshold))
    print(f"CHOOSE RELATE COLUMN WITHIN THRESHOLD (FROM {len(scores)} COLUMNS):",n_columns)
    print()

    return n_columns

In [238]:
# For every (question, SQL) pair: extract the identifiers used by the SQL and
# accumulate how many descriptions score above the weakest expected column.
n_cols = 0
start_time = time.perf_counter()  # monotonic clock, meant for measuring intervals
for q, sql_text in zip(questions, sql_queries):
    sql_parse = sqlparse.parse(sql_text)[0]
    # Sorted so the printed label order is the same on every run.
    col_labels = sorted(set(get_col(sql_parse)))
    n_cols += get_col_max_score(q,col_labels)
elapsed = time.perf_counter() - start_time

if questions:
    print("AVERAGE NUMBER OF COLUMNS:\t",n_cols/len(questions))
    print("AVG TIME PER QUESTION:\t",elapsed/len(questions))
else:
    # Avoid dividing by zero when the evaluation set is empty.
    print("No questions to evaluate.")

QUESTION:	 How many daily active users each day?
EXPECT COLUMNS:	 ['event_date', 'engagement_time_msec', 'user_pseudo_id', 'ga_session_id']
MIN THRESHOLD:	 0.15984684228897095
MAX THRESHOLD:	 0.23008966445922852
MAX SCORE COLUMN:	 deal_type
DESCIPTION:	 Deal of the day type
SCORE:	 0.39395755529403687
CHOOSE RELATE COLUMN WITHIN THRESHOLD (FROM 182 COLUMNS): 48

QUESTION:	 How many monthly active users each month?
EXPECT COLUMNS:	 ['engagement_time_msec', 'user_pseudo_id', 'event_month', 'ga_session_id']
MIN THRESHOLD:	 0.1179906576871872
MAX THRESHOLD:	 0.31292176246643066
MAX SCORE COLUMN:	 privacy_info_ads_storage
DESCIPTION:	 Whether ad targeting is enabled for a user.
SCORE:	 0.3766669034957886
CHOOSE RELATE COLUMN WITHIN THRESHOLD (FROM 182 COLUMNS): 67

QUESTION:	 What is the average number of daily active users last 7 days?
EXPECT COLUMNS:	 ['event_date', 'engagement_time_msec', 'user_pseudo_id', 'ga_session_id']
MIN THRESHOLD:	 0.11249055713415146
MAX THRESHOLD:	 0.23839810490

In [None]:
# Locate the position(s) of one specific question inside `questions`.
target_question = "What's the average sessions duration for users each day?"
np.nonzero(np.array(questions) == target_question)

(array([62]),)

In [None]:
# Debug aid: dump the top-level tokens of the SQL text held in `s`.
# NOTE: in the notebook as shown, `s` is assigned in a later cell, so that
# cell must be executed before this one on a fresh kernel.
first_statement = sqlparse.parse(s)[0]
for token in first_statement.tokens:
    print(type(token), token)
    if isinstance(token, sqlparse.sql.IdentifierList):
        # Show the children of an identifier list as well.
        print(token.tokens)

In [None]:
# Sample SQL text (nested sub-selects) bound to `s`. The token-inspection loop
# in the preceding cell calls sqlparse.parse(s), so this cell has to run first
# on a fresh kernel.
s = """SELECT AVG(cnt) FROM
  (SELECT event_date, COUNT(DISTINCT user_pseudo_id) as cnt
  FROM (
      SELECT event_date, ga_session_id, user_pseudo_id
      FROM team_tds.tds_intern.pointx_fbs_txn_rpt_dly
      GROUP BY event_date, ga_session_id, user_pseudo_id
      HAVING SUM(engagement_time_msec) > 10*1000
  )
  GROUP BY event_date
  ORDER BY event_date ASC)
WHERE DAYOFWEEK(event_date) IN (1, 7)"""