In [151]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sentence_transformers import SentenceTransformer, util
import json, os, math
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [132]:
# Folder of JSON schema files; each file is expected to provide a 'table'
# name and a 'columns' mapping of {column name: description}.
schema_root_path = "filtering_schema/src/schemas/coffeeshop-descriptions"
column_names = ['Table', 'Column', 'Description']

# Collect one small frame per schema file and concatenate once at the end,
# instead of re-copying a growing DataFrame on every iteration.
table_frames = []
for file in os.listdir(schema_root_path):
    schema_file_path = os.path.join(schema_root_path, file)
    # Skip sub-directories (e.g. .ipynb_checkpoints): open() would fail on them.
    if not os.path.isfile(schema_file_path):
        continue
    with open(schema_file_path, 'r') as f:
        schema = json.load(f)

    temp_df = pd.DataFrame(list(schema['columns'].items()), columns=['Column', 'Description'])
    temp_df['Table'] = schema['table']
    table_frames.append(temp_df[column_names])

# pd.concat raises on an empty list, so fall back to an empty frame that
# still carries the expected columns.
if table_frames:
    full_df = pd.concat(table_frames, ignore_index=True)
else:
    full_df = pd.DataFrame(columns=column_names)

print(full_df.shape)
full_df

(19, 3)


Unnamed: 0,Table,Column,Description
0,happy_hour,HH_ID,Unique identifier for the happy hour event
1,happy_hour,Shop_ID,Identifier of the shop hosting the happy hour
2,happy_hour,Month,Month in which the happy hour takes place
3,happy_hour,Num_of_staff_in_charge,Number of staff members in charge during the h...
4,member,Member_ID,Unique identifier for the member
5,member,Name,Name of the member
6,member,Membership_card,Details of the membership card
7,member,Age,Age of the member
8,member,Time_of_purchase,Date and time of purchase
9,member,Level_of_membership,Level or type of membership


In [133]:
# JSON file mapping each schema class name to its textual description.
schema_classes_file_path = "src/spider/cofee_shop/coffee_shop_class.json"

with open(schema_classes_file_path, mode='r') as classes_file:
    schema_classes = json.loads(classes_file.read())
schema_classes

{'event_detail': 'This class contains columns related to event details, such as happy hour event identifiers, the shop hosting the event, the month in which the event takes place, and the number of staff members in charge during the event.',
 'member_information': 'This class contains columns related to member information, such as member identifiers, member names, membership card details, age of the member, time of purchase, level or type of membership, and address of the member.',
 'shop_information': 'This class contains columns related to shop information, such as shop identifiers, shop addresses, the number of staff members working at the shop, the score or rating of the shop, and the year when the shop opened.'}

In [134]:
# Load the sentence-embedding model from the local model directory.
sentenceemb_model = SentenceTransformer("filtering_schema/models/all-MiniLM-L6-v2")

# Replace each class description with its embedding, in place.
# Only string values are encoded, so re-running this cell does not try to
# embed the vectors produced by a previous run.
for topic, info in schema_classes.items():
    if isinstance(info, str):
        schema_classes[topic] = sentenceemb_model.encode(info)

In [137]:
def most_relate_topic(text:str, min_threshold_score:float=0.2, 
                      topic_select:bool=False, n:int=10):
    """Match a text against the schema classes by embedding cosine similarity.

    The text is embedded with ``sentenceemb_model`` and compared with every
    value of ``schema_classes`` (which must already hold class embeddings).

    Args:
        text: Text to embed and compare.
        min_threshold_score: Minimum cosine similarity for a class to be
            returned. Only used when ``topic_select`` is False.
        topic_select: When True, return a per-class budget instead of a
            filtered list of class names.
        n: Total budget shared between the classes when ``topic_select`` is
            True.

    Returns:
        If ``topic_select`` is False: a list of the class names whose
        similarity is at least ``min_threshold_score``, in ``schema_classes``
        order.
        If ``topic_select`` is True: a dict mapping every class name to a
        non-negative integer share of ``n`` proportional to its similarity.
        Each share is rounded up, so the shares can add up to more than ``n``.
    """
    topic_names = list(schema_classes.keys())
    text_vec = sentenceemb_model.encode(text)
    topic_scores = np.array(
        [float(util.cos_sim(info, text_vec)) for info in schema_classes.values()]
    )

    if topic_select:
        # Cosine similarity can be negative. Treat such classes as weight 0 so
        # they can neither receive a negative share nor distort the others.
        weights = np.clip(topic_scores, 0.0, None)
        total = weights.sum()
        if total <= 0:
            # Nothing is positively related: avoid dividing by zero.
            return {key: 0 for key in topic_names}
        probs = (weights / total) * n
        return {key: math.ceil(share) for key, share in zip(topic_names, probs)}

    return [
        name
        for name, score in zip(topic_names, topic_scores)
        if score >= min_threshold_score
    ]

In [138]:
# Embed every column description, then tag each description with the schema
# classes returned by most_relate_topic for a 0.3 similarity threshold.
full_df['Vector'] = full_df['Description'].apply(sentenceemb_model.encode)
full_df['Topic'] = full_df['Description'].apply(
    lambda description: most_relate_topic(description, 0.3)
)
full_df.head(5)

Unnamed: 0,Table,Column,Description,Vector,Topic
0,happy_hour,HH_ID,Unique identifier for the happy hour event,"[-0.03035756, 0.11128711, 0.009328192, 0.03674...",[event_detail]
1,happy_hour,Shop_ID,Identifier of the shop hosting the happy hour,"[-0.020781243, 0.10354371, -0.022293467, 0.040...","[event_detail, shop_information]"
2,happy_hour,Month,Month in which the happy hour takes place,"[0.053571712, 0.09682498, -0.03012198, 0.02736...",[event_detail]
3,happy_hour,Num_of_staff_in_charge,Number of staff members in charge during the h...,"[0.048624337, 0.03614841, -0.005396093, 0.0233...",[event_detail]
4,member,Member_ID,Unique identifier for the member,"[-0.070519626, 0.01890726, -0.03272294, -0.004...",[member_information]


In [139]:
# One row per (column, topic) pair: list-valued 'Topic' cells are unrolled,
# so a column tagged with several topics repeats under the same index label.
expanded_df = full_df.explode(column='Topic')
expanded_df.head(5)

Unnamed: 0,Table,Column,Description,Vector,Topic
0,happy_hour,HH_ID,Unique identifier for the happy hour event,"[-0.03035756, 0.11128711, 0.009328192, 0.03674...",event_detail
1,happy_hour,Shop_ID,Identifier of the shop hosting the happy hour,"[-0.020781243, 0.10354371, -0.022293467, 0.040...",event_detail
1,happy_hour,Shop_ID,Identifier of the shop hosting the happy hour,"[-0.020781243, 0.10354371, -0.022293467, 0.040...",shop_information
2,happy_hour,Month,Month in which the happy hour takes place,"[0.053571712, 0.09682498, -0.03012198, 0.02736...",event_detail
3,happy_hour,Num_of_staff_in_charge,Number of staff members in charge during the h...,"[0.048624337, 0.03614841, -0.005396093, 0.0233...",event_detail


In [140]:
# Index labels of the rows tagged with the 'shop_information' topic.
expanded_df.loc[expanded_df['Topic'].eq('shop_information')].index

Index([1, 8, 11, 12, 13, 14, 15], dtype='int64')

In [164]:
question = 'How many happy hour occur from each shop'
print(question)
def selected_columns(question, max_n=10) -> list:
    """Pick the schema column names most relevant to a question.

    Every row of ``expanded_df`` is scored by cosine similarity between its
    'Vector' and the question embedding. ``most_relate_topic`` then gives each
    topic a budget, and the best-scoring rows of each topic are kept up to
    that budget.

    Args:
        question: Natural-language question to match against the columns.
        max_n: Budget passed to ``most_relate_topic`` as ``n``.

    Returns:
        De-duplicated list of column names, in first-selected order.
    """
    # Encode the question once; it is identical for every row.
    question_vec = sentenceemb_model.encode(question)

    # Work on a copy so adding 'Score' never writes into a slice of expanded_df.
    scored = expanded_df[['Vector', 'Topic']].copy()
    scored['Score'] = [float(util.cos_sim(vec, question_vec)) for vec in scored['Vector']]

    topic_selected = most_relate_topic(question, n=max_n, topic_select=True)
    print(topic_selected)

    related_columns = []
    for topic, num in topic_selected.items():
        if num <= 0:
            # head() with a negative count keeps all but the last rows,
            # which is the opposite of "select nothing".
            continue
        topic_scores = scored.loc[scored['Topic'] == topic, 'Score']
        selected_col_index = topic_scores.sort_values(ascending=False).head(num).index
        # expanded_df keeps full_df's index labels, so they address full_df rows.
        related_columns.extend(full_df.loc[selected_col_index, 'Column'].to_list())

    # dict.fromkeys de-duplicates while keeping a deterministic order.
    return list(dict.fromkeys(related_columns))

selected_columns(question)

How many happy hour occur from each shop
{'event_detail': 6, 'member_information': 1, 'shop_information': 4}


['Total_amount',
 'HH_ID',
 'Open_Year',
 'Month',
 'Shop_ID',
 'Score',
 'Num_of_staff_in_charge',
 'Num_of_staff',
 'Member_ID']

In [34]:

# Bag-of-words counts over the column descriptions (English stop words
# dropped, vocabulary capped at 1000 terms). A TF-IDF matrix built with
# TfidfVectorizer(stop_words='english') was an earlier alternative input.
vectorizer = CountVectorizer(stop_words='english', max_features=1000)
dtm = vectorizer.fit_transform(full_df['Description'])

# Fit Latent Dirichlet Allocation on the count matrix.
n_topics = 3  # You can adjust the number of topics based on your needs
lda = LatentDirichletAllocation(random_state=42, n_components=n_topics)
topics = lda.fit_transform(dtm)

# Assign the most probable topic to each column.
# NOTE: this overwrites any existing 'Topic' column of full_df with integer ids.
full_df['Topic'] = np.argmax(topics, axis=1)

# Show the columns that fell into each LDA topic.
grouped = full_df.groupby('Topic')
for group_name, group_data in grouped:
    print(f"Category: {group_name}")
    print(group_data[['Column', 'Description']])

Category: 0
                 Column                  Description
8      Time_of_purchase    Date and time of purchase
9   Level_of_membership  Level or type of membership
10              Address        Address of the member
Category: 1
             Column                                        Description
0             HH_ID         Unique identifier for the happy hour event
1           Shop_ID      Identifier of the shop hosting the happy hour
2             Month          Month in which the happy hour takes place
4         Member_ID                   Unique identifier for the member
5              Name                                 Name of the member
6   Membership_card                     Details of the membership card
7               Age                                  Age of the member
11          Shop_ID                     Unique identifier for the shop
16            HH_ID                 Identifier of the happy hour event
17        Member_ID  Identifier of the member particip