In [6]:
import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# from sklearn.decomposition import LatentDirichletAllocation
from sentence_transformers import SentenceTransformer, util
import json, os, math, time, ast
import numpy as np
import warnings
warnings.filterwarnings('ignore')
sentenceemb_model = SentenceTransformer("filtering_schema/models/all-MiniLM-L6-v2")

In [7]:
# schema_root_path = "filtering_schema/src/schemas/coffeeshop-descriptions"
schema_root_path = "filtering_schema/src/schemas/column-descriptions"
table_description_vec = dict()
column_names = ['Table', 'Column', 'Description']
full_df = pd.DataFrame(columns=column_names)

for file in os.listdir(schema_root_path):
    schema_file_path = os.path.join(schema_root_path, file)
    with open(schema_file_path, 'r') as f:
        schema = json.load(f)

    table_description_vec[schema['table']] = sentenceemb_model.encode(schema['description'])

    temp_df = pd.DataFrame(schema['columns'].items() ,columns=['Column', 'Description'])
    temp_df['Table'] = schema['table']
    temp_df = temp_df[column_names]
    full_df = pd.concat([full_df, temp_df], ignore_index=True)

print(full_df.shape)
full_df

(425, 3)


Unnamed: 0,Table,Column,Description
0,pointx_keymatrix_dly,month_id,Transaction month
1,pointx_keymatrix_dly,ntx_pointx_financial,"All financial transactions, both Payment, Top ..."
2,pointx_keymatrix_dly,ntx_pointx_financial_out,"Number of Payment Transaction, Transfer"
3,pointx_keymatrix_dly,ncust_user,All the number of Customer in the systemAnd ca...
4,pointx_keymatrix_dly,ncust_pointx,"Point X Customers, both Customer Type, Easy an..."
...,...,...,...
420,pointx_fbs_rpt_dly,update_with_analytics,Update with analytics
421,pointx_fbs_rpt_dly,client_code,Client code
422,pointx_fbs_rpt_dly,client_member_id,Client member id
423,pointx_fbs_rpt_dly,_dl_load_ts,Date of data loading


In [8]:
# schema_classes_file_path = "src/spider/cofee_shop/coffee_shop_class.json"
schema_classes_file_path = "src/pointx/pointx_classes.json"
with open(schema_classes_file_path, 'r') as f:
    schema_classes = json.load(f)
schema_classes

{'pointx_keymatrix_dly': {'Dashboard_Metadata': "This class contains metadata about the Key Matrix Dashboard Design table. It includes essential information such as the table's name, which is 'pointx_keymatrix_dly,' and a descriptive overview of the table's purpose and structure. This class primarily serves to provide context and understanding of the data, making it useful for effective dashboard design.",
  'Transaction_Metrics': 'The Transaction_Metrics class comprises columns related to various transaction metrics. It encompasses critical data points such as transaction counts, financial transactions, top-ups, transfers, and payment-related statistics. These metrics offer insights into the volume and nature of transactions, facilitating analysis of payment patterns, financial activity, and user behavior.',
  'Customer_Metrics': 'The Customer_Metrics class focuses on columns related to customer-related metrics. It provides information about different customer types, customer counts, 

In [9]:
for table, table_class in schema_classes.items():
    for topic, info in table_class.items():
        schema_classes[table][topic] = sentenceemb_model.encode(info)


In [10]:
def most_relate_topic(text:str, table:str,topic_threshold_score:float=0.4, 
                      topic_select:bool=False, top_n:int=10, base_n:int=1):
    text_vec = sentenceemb_model.encode(text)
    topic_scores = [float(util.cos_sim(info, text_vec)) for info in schema_classes[table].values()]
    if topic_select:
        probs = (topic_scores / np.sum(topic_scores)) * top_n
        topic_selected = { key: max(base_n, math.ceil(score)) for key, score in zip(schema_classes[table].keys(), probs)}
        return topic_selected
    related_topic_indices = np.where(np.array(topic_scores) >= topic_threshold_score)[0]
    related_topics = [list(schema_classes[table].keys())[i] for i in related_topic_indices]
    # {'event_detail': 6, 'member_information': 2, 'shop_information': 4}
    return related_topics

In [11]:
full_df['Vector'] = full_df['Description'].apply(lambda x: sentenceemb_model.encode(x))
full_df['Topic'] = full_df.apply(lambda row: most_relate_topic(row['Description'], row['Table']), axis=1)
full_df.head()

Unnamed: 0,Table,Column,Description,Vector,Topic
0,pointx_keymatrix_dly,month_id,Transaction month,"[0.06046718, 0.0097362045, -0.08348081, -0.012...","[Transaction_Metrics, Date_information]"
1,pointx_keymatrix_dly,ntx_pointx_financial,"All financial transactions, both Payment, Top ...","[0.019717792, -0.03715116, -0.08518567, -0.064...",[Transaction_Metrics]
2,pointx_keymatrix_dly,ntx_pointx_financial_out,"Number of Payment Transaction, Transfer","[0.04923797, -0.011678213, -0.06361538, -0.045...",[Transaction_Metrics]
3,pointx_keymatrix_dly,ncust_user,All the number of Customer in the systemAnd ca...,"[-0.0042273775, -0.023767758, -0.041891005, -0...","[Customer_Metrics, External_Partner_Metrics]"
4,pointx_keymatrix_dly,ncust_pointx,"Point X Customers, both Customer Type, Easy an...","[-0.0097105885, -0.046169803, 0.016053455, -0....",[Customer_Metrics]


In [12]:
# 1st normalize form
expanded_df = full_df.explode('Topic')
expanded_df.head()

Unnamed: 0,Table,Column,Description,Vector,Topic
0,pointx_keymatrix_dly,month_id,Transaction month,"[0.06046718, 0.0097362045, -0.08348081, -0.012...",Transaction_Metrics
0,pointx_keymatrix_dly,month_id,Transaction month,"[0.06046718, 0.0097362045, -0.08348081, -0.012...",Date_information
1,pointx_keymatrix_dly,ntx_pointx_financial,"All financial transactions, both Payment, Top ...","[0.019717792, -0.03715116, -0.08518567, -0.064...",Transaction_Metrics
2,pointx_keymatrix_dly,ntx_pointx_financial_out,"Number of Payment Transaction, Transfer","[0.04923797, -0.011678213, -0.06361538, -0.045...",Transaction_Metrics
3,pointx_keymatrix_dly,ncust_user,All the number of Customer in the systemAnd ca...,"[-0.0042273775, -0.023767758, -0.041891005, -0...",Customer_Metrics


In [13]:
expanded_df[expanded_df['Topic'] == 'Date_information']

Unnamed: 0,Table,Column,Description,Vector,Topic
0,pointx_keymatrix_dly,month_id,Transaction month,"[0.06046718, 0.0097362045, -0.08348081, -0.012...",Date_information
166,pointx_keymatrix_dly,_dl_load_ts,Data download date,"[0.005903223, -0.0006619766, -0.040421296, -0....",Date_information
167,pointx_keymatrix_dly,_date,Transaction date,"[0.016334053, 0.032540742, -0.04418236, -0.005...",Date_information


In [14]:
# question = 'How many member paying each shop that has most happy hour'
question = "How many point transaction occur each month"
tables = ['pointx_keymatrix_dly', 'pointx_fbs_rpt_dly']

In [15]:
def table_selected(question:str,n_select:int , table_descriptions_vector:dict = table_description_vec):
    question_vector = sentenceemb_model.encode(question)
    table_scores = {table : round(float(util.cos_sim(table_vector, question_vector)),3) for table, table_vector in table_descriptions_vector.items()}
    sum_score = sum(table_scores.values())
    tab_select = {table: math.floor(n_select * score / sum_score) for table, score in table_scores.items()}
    return tab_select

table_selected(question, 20)

{'pointx_keymatrix_dly': 3, 'pointx_cust_mly': 9, 'pointx_fbs_rpt_dly': 7}

In [16]:

def selected_columns(question, specific_tables:list, max_n=20) -> dict:

    _df = expanded_df[['Table', 'Column', 'Vector', 'Topic']]
    _df = _df[_df['Table'].isin(specific_tables)]
    question_vector = sentenceemb_model.encode(question)
    _df['Score'] = _df['Vector'].apply(lambda x: float(util.cos_sim(x, question_vector)))

    # {'event_detail': 6, 'member_information': 2, 'shop_information': 4}
    topic_selected = dict()
    for table in specific_tables:
        table_select = (max_n // len(specific_tables))
        topic_selected.update(most_relate_topic(question, table, top_n=table_select, base_n=1, topic_select=True))
        
    print(topic_selected)
    used_schema = {table : dict() for table in specific_tables}
    used_cols = []

    for topic, num in topic_selected.items():
        selected_col_index = _df[_df['Topic'] == topic]['Score'].sort_values(ascending=False).head(num).index
        used_cols.extend(_df.loc[selected_col_index, 'Column'].to_list())

    used_cols = list(set(used_cols))

    for i, row in _df[_df['Column'].isin(used_cols)].iterrows():
        used_schema[row['Table']][row['Column']] = round(row['Score'],3)

    return used_schema

selected_columns(question, tables)

{'Dashboard_Metadata': 1, 'Transaction_Metrics': 3, 'Customer_Metrics': 2, 'Point_Rate_Metrics': 2, 'Date_information': 2, 'External_Partner_Metrics': 2, 'UserInteractionMetrics': 3, 'DeviceInformation': 2, 'GeoLocationMetrics': 2, 'AppInfoMetrics': 2, 'TrafficSourceMetrics': 2, 'EventDimensions': 2}


{'pointx_keymatrix_dly': {'month_id': 0.636,
  'mtd1_ncust_visit': 0.637,
  'mtd1_amt_point_topup': 0.636,
  'mtd1_amt_point_transfer_out': 0.761,
  'mtd1_amt_point_pay': 0.689,
  'mtd1_n_point_payment_p_only': 0.687,
  'mtd1_amt_point_transfer_out_extnl': 0.643,
  '_date': 0.422},
 'pointx_fbs_rpt_dly': {'event_date': 0.179,
  'event_name': 0.253,
  'user_properties_ga_session_number_set_timestamp_micros': 0.261,
  'traffic_source_name': 0.148,
  'campaign_info_source': 0.015,
  'customer_device_lat': 0.144,
  'customer_device_long': 0.112,
  'customer_lat': 0.204,
  'system_app': 0.054,
  '_date': 0.535}}

In [17]:
expanded_df[(expanded_df['Topic'] == 'Date_information') & (expanded_df['Table'] == 'pointx_keymatrix_dly')]

Unnamed: 0,Table,Column,Description,Vector,Topic
0,pointx_keymatrix_dly,month_id,Transaction month,"[0.06046718, 0.0097362045, -0.08348081, -0.012...",Date_information
166,pointx_keymatrix_dly,_dl_load_ts,Data download date,"[0.005903223, -0.0006619766, -0.040421296, -0....",Date_information
167,pointx_keymatrix_dly,_date,Transaction date,"[0.016334053, 0.032540742, -0.04418236, -0.005...",Date_information


In [18]:
exp_df = pd.read_excel('src/pointx/SchemaLink-Experiment.xlsx', sheet_name='LLM result - full pipeline')[['Table', 'Question', 'Actual result']]
exp_df

Unnamed: 0,Table,Question,Actual result
0,pointx_keymatrix_dly,What is the total number of all financial tran...,"['month_id', 'ntx_pointx_financial']"
1,pointx_keymatrix_dly,What is the total amount of points generated b...,"['month_id', 'amt_point_topup']"
2,pointx_keymatrix_dly,What is the total amount of points generated b...,"['month_id', 'amt_point_pay']"
3,pointx_keymatrix_dly,What is the average rate of released points fo...,['rate_point_per_baht_pay']
4,pointx_keymatrix_dly,Can you determine the average number of custom...,"['month_id', 'ncust_visit']"
...,...,...,...
100,pointx_fbs_rpt_dly,What's the average sessions duration for users...,"['event_date', 'event_timestamp', 'event_name'..."
101,pointx_fbs_rpt_dly,What's the percentage of users who have used t...,"['event_date', 'user_pseudo_id']"
102,pointx_fbs_rpt_dly,How many users have performed a specific event...,"['event_date', 'user_pseudo_id']"
103,pointx_fbs_rpt_dly,How many users have opened the app on at least...,"['user_pseudo_id', 'device_category']"


In [19]:
def safe_divide(numerator, denominator):
    try:
        result = round(numerator / denominator,2)
    except :
        result = "ZeroDivisionError"
    return result

In [20]:
def f1_columns(actual_columns, result_columns):
    col_TP = len(set(actual_columns) & set(result_columns))
    col_FP = len(set(result_columns) - set(actual_columns))
    col_FN = len(set(actual_columns) - set(result_columns))

    if col_TP is not None and col_FN is not None and col_FP is not None:
        if len(actual_columns) == 0 : col_recall = 1
        else: col_recall = safe_divide(col_TP, col_TP + col_FN)
        col_precision = safe_divide(col_TP, col_TP + col_FP)
        
    if col_precision is not None and col_recall is not None and col_recall != "ZeroDivisionError" and col_precision != "ZeroDivisionError":
        col_f1 = 2 * safe_divide(col_precision * col_recall, col_precision + col_recall)
        if type(col_f1) == str: col_f1 = "ZeroDivisionError"
    else: col_f1 = "ERROR"
    
    return col_recall, col_precision, col_f1

In [21]:
measurement_data = {
    "Question" : [],
    "Schemalink result" : [],
    "SL recall" : [],
    "SL precision" : [],
    "SL f1" : []
}
for i, row in exp_df.iterrows():
    
    start_time = time.time()

    actual_columns = ast.literal_eval(row['Actual result'])
    module_result = list(selected_columns(row['Question'], [row['Table']], max_n=20)[row['Table']].keys())

    module_recall, module_precision, module_f1 = f1_columns(actual_columns, module_result)

    measurement_data['Question'].append(row['Question'])
    measurement_data["Schemalink result"].append(module_result)
    measurement_data["SL recall"].append(module_recall)
    measurement_data["SL precision"].append(module_precision)
    measurement_data["SL f1"].append(module_f1)

    print(row['Question'])
    print(actual_columns)
    print(module_result)
    print(module_recall, module_precision, module_f1)
    print('Escape time:', time.time() - start_time)
    print()
    
measurement_df = pd.DataFrame(measurement_data)
merged_df = pd.merge(exp_df, measurement_df, on='Question', how='outer')

# merged_df.to_excel('EXP_temp.xlsx', index=False)

{'Dashboard_Metadata': 2, 'Transaction_Metrics': 7, 'Customer_Metrics': 3, 'Point_Rate_Metrics': 3, 'Date_information': 5, 'External_Partner_Metrics': 3}
What is the total number of all financial transactions for each month?
['month_id', 'ntx_pointx_financial']
['month_id', 'ntx_pointx_financial', 'mtd1_ncust_visit', 'mtd1_ncust_pointx_financial', 'mtd1_n_topup_point', 'mtd1_n_transfer_point_out', 'mtd1_n_purchase', 'mtd1_n_purchase_qr', 'mtd1_n_purchase_pyw', 'mtd1_n_point_payment_p_only', 'mtd1_n_point_payment_p_casa', 'mtd1_n_point_payment_p_cc', 'mtd1_n_topup_point_extnl', 'n_transfer_point_out_extnl', 'mtd1_n_transfer_point_out_extnl', 'mtd1_amt_point_transfer_out_extnl', '_dl_load_ts', '_date']
1.0 0.11 0.2
Escape time: 0.05218982696533203

{'Dashboard_Metadata': 2, 'Transaction_Metrics': 5, 'Customer_Metrics': 3, 'Point_Rate_Metrics': 4, 'Date_information': 4, 'External_Partner_Metrics': 5}
What is the total amount of points generated by all top-up transactions in August 2022?
[