In [None]:
import pandas as pd
from googletrans import Translator
from tqdm import tqdm
from dotenv import load_dotenv
from pathlib import Path
import json, os

# Shared googletrans client used by the translation steps further down.
translator = Translator()

# Folder that holds the PointX source documents.
folder_path = "src/pointx"

# Lookup from a (lower-cased) source data type to the coarse "text"/"number"
# label recorded in col_types.
change_type = {
    "string": "text",
    "int": "number",
    "bigint": "number",
    "decimal(27,2)": "number",
    "double": "number",
    "timestamp": "text",
    "date": "text",
}

# Excel workbook read for the pointx_keymatrix_dly table.
schema_desc_path = os.path.join(
    folder_path,
    "ETL Mapping & Data Dict - PointX (1).xlsx",
)

# Loading a .env file is currently disabled.
# dotenv_path = Path('.env')
# load_dotenv(dotenv_path=dotenv_path)

# Preparing Tables

## pointx_keymatrix_dly Table

In [None]:
# Sheet '14' is read whole: row 17 of the raw frame supplies the column
# headers and rows 18 onward supply the records.
raw_sheet = pd.read_excel(schema_desc_path, sheet_name='14')
header_row = raw_sheet.iloc[17, :]
df = raw_sheet.iloc[18:, :].reset_index(drop=True)
df.columns = header_row
df.columns.name = None  # clear the name the column index inherits from the header row
df.head()

In [None]:
# Per-column outputs for this table: coarse data type and English description.
col_types = {}
col_descs = {}
table_name = df['Table'].unique().tolist()[0]
# Same text as before; written as explicit pieces so the trailing space before
# each line break is visible and survives editors that trim whitespace.
table_desc = (
    "The Key Matrix Dashboard Design table provides a detailed overview of dashboard-related database columns, \n"
    "including data types, status indicators, descriptions, conditions, business logic, and sample data, \n"
    "enabling a comprehensive understanding of the data structure for effective dashboard design."
)

# iterrows() is a generator with no length, so tqdm is given the row count
# explicitly; otherwise it cannot show a percentage or an ETA.
for i, row in tqdm(df.iterrows(), total=len(df)):
    col_name = row['Column']

    # Normalise the type label before the lookup so that padded cells still
    # match and an unknown/non-string cell produces a readable error.
    raw_type = str(row['Data Type']).strip().lower()
    if raw_type not in change_type:
        raise KeyError(f"Unmapped data type {row['Data Type']!r} for column {col_name!r}")
    data_type = change_type[raw_type]

    # One translation request per row.
    desc = translator.translate(row['Description'], dest='en').text

    col_types[col_name] = data_type
    col_descs[col_name] = desc


In [None]:
# Bundle the table-level description with the per-column descriptions.
schema_desc = dict(
    table=table_name,
    description=table_desc,
    columns=col_descs,
)

# Writing the two JSON files is currently disabled.
# with open(os.path.join(folder_path, "pointx_keymatrix_dly_schema_description.json"),'w') as f:
#     json.dump(schema_desc, f, indent=4)

# with open(os.path.join(folder_path, "pointx_keymatrix_dly_columns_type.json"),'w') as f:
#     json.dump(col_types, f, indent=4)

In [None]:
# with open(os.path.join(folder_path, "pointx_keymatrix_dly_schema_description.json"),'r') as f:
#     col_descs = json.load(f)

## pointx_cust_mly Table

In [None]:
# Load the column -> type mapping for pointx_cust_mly and keep its column names.
# NOTE(review): the schema-loading calls further down read
# "pointx_cust_mly_columns_type.json" for this table; confirm which file name is correct.
with open("src/pointx/schema/pointx_cust_mly_type.json") as type_file:
    col_type = json.load(type_file)
col_names = set(col_type)

In [None]:
# Read the business glossary and keep only the two columns used below.
glossary = pd.read_excel("src/pointx/Business Glossary 1.xlsx")[['col_name', 'descriptions']]


def _is_nonblank_text(value):
    """Return True when value is a string that is not empty after stripping."""
    return isinstance(value, str) and value.strip() != ''


# Keep a row only when BOTH cells are non-blank strings. Per-column Series.map
# replaces DataFrame.applymap, which is deprecated since pandas 2.1.
# astype(bool) keeps the mask boolean even when the sheet has no rows.
keep_row = (
    glossary['col_name'].map(_is_nonblank_text).astype(bool)
    & glossary['descriptions'].map(_is_nonblank_text).astype(bool)
)

# assign() returns a new frame, so the translated text is never written into
# a slice of the glossary. One translation request is made per kept row.
df = glossary[keep_row].assign(
    descriptions=lambda frame: frame['descriptions'].apply(
        lambda desc: translator.translate(desc, dest='en').text
    )
)


In [None]:
# Name and hand-written summary of the monthly customer table.
table_name = "pointx_cust_mly"
# Explicit pieces keep the trailing space before each line break visible.
table_desc = (
    "The table provides a comprehensive monthly overview of customer engagement within the app, \n"
    "capturing data related to accumulated points, usage patterns, and relevant metrics, \n"
    "facilitating in-depth analysis of user behavior and app performance."
)

In [None]:
# Glossary descriptions keyed by column name.
glossary_descs = df.set_index('col_name')['descriptions'].to_dict()

# Keep only the columns that exist in the table's type file (col_names).
# The previous check compared col_descs against itself, so it never removed
# anything; a new dict is built because deleting keys from a dict while
# iterating over it raises RuntimeError.
col_descs = {
    col: desc
    for col, desc in glossary_descs.items()
    if col in col_names
}

schema_desc = {
    "table": table_name,
    "description": table_desc,
    "columns": col_descs
}
# Persist the schema description for this table.
with open(os.path.join(folder_path, "schema/pointx_cust_mly_schema_description.json"),'w') as f:
    json.dump(schema_desc, f, indent=4)

## pointx_fbs_rpt_dly Table

In [None]:
# Name and hand-written summary of the daily event table.
table_name = "pointx_fbs_rpt_dly"
# Explicit pieces keep the trailing spaces of the original text visible.
table_desc = (
    "Table records user interactions with the PointX app daily, capturing events such as app opens and deletions, \n"
    "providing key insights into user behavior, app version usage, and device characteristics "
)

# Column descriptions come from a CSV with 'Column' and 'Description' columns.
df = pd.read_csv("src/pointx/pointx_fbs_rpt_dly_description.csv")
description_by_column = df.set_index('Column')['Description']
col_descs = description_by_column.to_dict()

In [None]:
# Bundle the table-level description with the per-column descriptions.
schema_desc = dict(
    table=table_name,
    description=table_desc,
    columns=col_descs,
)

# Writing the JSON file is currently disabled.
# with open(os.path.join(folder_path, "pointx_fbs_rpt_dly_description.json"),'w') as f:
#     json.dump(schema_desc, f, indent=4)

# Schema filtering

In [1]:
import os, json
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from nltk.stem import WordNetLemmatizer
from transformers import AutoTokenizer
# from sql_metadata import Parser

# Sentence encoder (loaded from the local models/ folder), its tokenizer, and a lemmatizer.
model = SentenceTransformer('models/all-MiniLM-L6-v2')
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
lemmanizer = WordNetLemmatizer()

# Registries keyed by table name; join_schema() fills them in.
table_desc_vectors: dict = {}    # table -> embedding of the table description
schema_desc_vectors: dict = {}   # table -> {column -> embedding of the column description}
schema_datatypes: dict = {}      # table -> {column -> datatype}

In [2]:
def join_schema(schema_description_path:str, schema_datatype_path:str):
    """Register one table's schema in the module-level lookup dictionaries.

    Args:
        schema_description_path: JSON file with the keys "table",
            "description" and "columns" (column name -> description text).
        schema_datatype_path: JSON file whose content is stored as-is as the
            table's datatype mapping.

    Side effects:
        Adds or overwrites the table's entry in table_desc_vectors,
        schema_datatypes and schema_desc_vectors. Descriptions are embedded
        with the module-level `model`, one encode call per text.
    """
    with open(schema_description_path) as description_file:
        description = json.load(description_file)
    with open(schema_datatype_path) as datatype_file:
        datatypes = json.load(datatype_file)

    name = description['table']
    table_desc_vectors[name] = model.encode(description['description'])
    schema_datatypes[name] = datatypes

    # One embedding per column description, keyed by column name.
    schema_desc_vectors[name] = {
        column: model.encode(text)
        for column, text in description["columns"].items()
    }

In [3]:
# Register every table; each one has a "<table>_schema_description.json" and a
# "<table>_columns_type.json" file under src/pointx/schema/.
for schema_table in ("pointx_fbs_rpt_dly", "pointx_cust_mly", "pointx_keymatrix_dly"):
    join_schema(f"src/pointx/schema/{schema_table}_schema_description.json",
                f"src/pointx/schema/{schema_table}_columns_type.json")

In [33]:
def filter_schema(question:str, column_threshold:float = 0.4, table_threshold:float = 0.2, filter_tables:bool = True,
                  table_offset:float = 0.1):
    """Pick, for each table, the columns that look relevant to a question.

    A column is selected when the cosine similarity between its description
    embedding and the question embedding reaches the column threshold, or
    when its name appears verbatim as a whitespace-separated token of the
    question.

    Args:
        question: Natural-language question.
        column_threshold: Minimum cosine similarity for a column.
        table_threshold: Minimum cosine similarity for a table description;
            only used when filter_tables is True.
        filter_tables: When True, tables below table_threshold are dropped
            before columns are examined; when False every registered table
            is examined.
        table_offset: Amount subtracted from column_threshold for tables
            whose name appears verbatim in the question.

    Returns:
        dict mapping each examined table name to the list of selected column
        names (possibly empty).

    Side effects:
        Prints a line for every table-name or column-name token match.
    """
    question_emb = model.encode(question)
    used_schemas = {}
    found_table = []
    # Collected across ALL tokens. It used to be reset for every token, which
    # kept only the last token's matches and left the name undefined when the
    # question had no tokens at all.
    found_columns = set()

    # String matching of question tokens against table and column names.
    for token in question.split():
        if token in schema_desc_vectors:
            print("Table offset  ---->", token)
            found_table.append(token)
        for columns in schema_desc_vectors.values():
            if token in columns:
                found_columns.add(token)
                print("Column matching  --->",token)

    if filter_tables:
        # Keep only tables whose description is close enough to the question.
        used_tables = [
            table_name
            for table_name, table_vector in table_desc_vectors.items()
            if bool(util.cos_sim(table_vector, question_emb) >= table_threshold)
        ]
    else:
        # Examine the columns of every registered table.
        used_tables = list(table_desc_vectors.keys())

    for table in used_tables:
        # Tables named in the question get a lower bar for their columns.
        threshold = column_threshold - (table_offset if table in found_table else 0)
        used_schemas[table] = [
            column
            for column, column_vector in schema_desc_vectors[table].items()
            if column in found_columns
            or bool(util.cos_sim(column_vector, question_emb) >= threshold)
        ]

    return used_schemas

In [35]:
# Example: run the filter on one question and list the columns kept per table.
question = "Which event_date has largest transaction of pointx_fbs_rpt_dly table"
result = filter_schema(
    question,
    column_threshold=0.4,
    table_threshold=0.3,
    filter_tables=False,
)
for selected_table, selected_columns in result.items():
    print(f"Table : {selected_table}")
    print(f"Selected columns : {selected_columns}")

Column matching ---> event_date
Table offset  ----> pointx_fbs_rpt_dly
Table : pointx_fbs_rpt_dly
Selected columns : ['event_date', 'event_month', 'event_bundle_sequence_id', 'event_timestamp', 'event_name', 'event_previous_timestamp', 'device_web_info_hostname', 'geo_region', 'event_id', 'transaction_status', 'transaction_type', '_date']
Table : pointx_cust_mly
Selected columns : ['date_from_last_financial', 'date_last_financial', 'days_from_last_financial']
Table : pointx_keymatrix_dly
Selected columns : ['mtd1_ncust_visit', 'mtd1_ncust_pointx_visit', 'mtd1_ncust_pointx_financial', 'mtd1_ncust_guest_visit', 'mtd1_amt_point_topup', 'mtd1_amt_point_transfer_out', 'mtd1_amt_point_pay', 'mtd1_amt_point_pay_sku', 'mtd1_amt_point_pay_pyw', 'mtd1_n_topup_point', 'mtd1_n_topup_point_onboard', 'mtd1_n_purchase', 'mtd1_n_point_payment_p_only', 'mtd1_amt_point_transfer_out_extnl', '_date']
