In [75]:
from semantic_router import Route
from semantic_router.encoders import OpenAIEncoder, HuggingFaceEncoder
from semantic_router import RouteLayer
import pandas as pd

In [76]:
# Read the synthetic intention examples from their JSON file.
df_synthetic = pd.read_json("synthetic_intetions.json")

# Features are the id and text columns; targets are the intention labels as a plain list.
X_syn = df_synthetic.loc[:, ['Id', 'Message']]
y_syn = df_synthetic['Intention'].tolist()

In [77]:
# Read the additional ("new") intention examples from their JSON file.
df_new = pd.read_json("new_intentions.json")

# Same layout as the synthetic set: id/text features and a list of intention labels.
X_new = df_new.loc[:, ['Id', 'Message']]
y_new = df_new['Intention'].tolist()

In [78]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the synthetic examples as a test set. The split is stratified
# on the intention labels and seeded (random_state=0) so it is repeatable.
SPLIT_OPTIONS = dict(test_size=0.1, random_state=0, stratify=y_syn)
X_train, X_test, y_train, y_test = train_test_split(X_syn, y_syn, **SPLIT_OPTIONS)

In [79]:
# Training features: the synthetic training split followed by the new examples,
# with the row index rebuilt from 0.
X_final = pd.concat((X_train, X_new), ignore_index=True)

# Training labels in the same order as the rows of X_final.
y_final = [*y_train, *y_new]

In [80]:
# The label lists carry the string "None" for messages without an intention;
# turn it into a real None in the lists used for fitting and evaluation.
y_final = [label if label != "None" else None for label in y_final]
y_test = [label if label != "None" else None for label in y_test]

In [81]:
# Inspect the raw training labels; unlike y_final and y_test, this list still holds the string 'None'.
y_train

['manage_personal_info',
 'leverage_rag',
 'manage_personal_info',
 'None',
 'search_scholarships_and_internationals',
 'search_scholarships_and_internationals',
 'matchmaking',
 'matchmaking',
 'search_scholarships_and_internationals',
 'manage_personal_info',
 'company_info',
 'manage_personal_info',
 'search_scholarships_and_internationals',
 'company_info',
 'query_matches',
 'query_matches',
 'leverage_rag',
 'company_info',
 'leverage_rag',
 'matchmaking',
 'manage_personal_info',
 'manage_personal_info',
 'manage_personal_info',
 'search_scholarships_and_internationals',
 'company_info',
 'company_info',
 'manage_personal_info',
 'leverage_rag',
 'manage_personal_info',
 'None',
 'manage_personal_info',
 'search_universities',
 'leverage_rag',
 'leverage_rag',
 'company_info',
 'None',
 'manage_personal_info',
 'search_scholarships_and_internationals',
 'search_scholarships_and_internationals',
 'company_info',
 'search_universities',
 'manage_personal_info',
 'matchmaking',
 'c

In [82]:
# Every label used in this notebook; the trailing None covers messages that
# belong to no route.
user_intentions = [
    "manage_personal_info",
    "search_scholarships_and_internationals",
    "search_universities",
    "matchmaking",
    "query_matches",
    "leverage_rag",
    "company_info",
    None,
]

In [83]:
# Group the held-out messages by intention; the per-intention lists are used
# below as the utterances of each route.
# NOTE(review): these utterances come from X_test, the same split the route
# layers are evaluated on in this notebook, so the reported accuracies are
# likely optimistic -- confirm this is intended.
MESSAGES_INFO = {intention: [] for intention in user_intentions}

for message, label in zip(X_test["Message"], y_test):
    # A direct lookup replaces the former scan over every intention, whose
    # trailing `continue` had no effect. Labels that are not listed in
    # user_intentions are skipped, exactly as before.
    if label in MESSAGES_INFO:
        MESSAGES_INFO[label].append(message)

MESSAGES_INFO

{'manage_personal_info': ['Please modify my education level to high school.',
  'Please add a preference for urban universities.',
  'What is my registered age?',
  'Please update my password to Passw0rd!',
  'I want to change my username to TechGuru123.',
  'Update my password to NewPassword123!'],
 'search_scholarships_and_internationals': ['Are there scholarships for studying technology abroad?',
  'Can you list scholarships available for STEM students?',
  'What scholarships are available for international students?',
  'Can you find universities with scholarships in Australia?',
  'What international scholarships are available for undergraduate students?',
  'Can you provide details on the Erasmus+ scholarship?'],
 'search_universities': ['Search for universities that teach environmental studies.',
  'What universities have courses in event management?',
  'Are there universities with programs in social work?',
  'What are the top universities for studying sociology?',
  'Search f

In [84]:
def _route_for(name, description):
    """Build a Route called `name` whose utterances are MESSAGES_INFO[name]."""
    return Route(name=name, description=description, utterances=MESSAGES_INFO[name])


# One route per non-None intention, created in the same order as before.
manage_personal_info = _route_for(
    "manage_personal_info",
    "The user wants to manage his personal information, such as username, age, password, et cetera...",
)
company_info = _route_for(
    "company_info",
    "The user wants to know information about the company called 'UniMatch'.",
)
leverage_rag = _route_for(
    "leverage_rag",
    "The user intends wants the chatbot to extract information from an external source (PDF or website).",
)
matchmaking = _route_for(
    "matchmaking",
    "The user wants to make matches for his university, specifying some preferences.",
)
query_matches = _route_for(
    "query_matches",
    "The user wishes to access and see the previously-made matches.",
)
search_scholarships_and_internationals = _route_for(
    "search_scholarships_and_internationals",
    "The user wishes to search for scholarships or international opportunities.",
)
search_universities = _route_for(
    "search_universities",
    "The user wishes to search for universities or courses.",
)

# The list handed to every RouteLayer below.
routes = [
    manage_personal_info,
    company_info,
    leverage_rag,
    matchmaking,
    query_matches,
    search_scholarships_and_internationals,
    search_universities,
]



# Baseline Routers

In [85]:
# Baseline route layer on the HuggingFace encoder, built with default settings
# (tunable options: aggregation = "mean", "max" or "sum"; top_k = 5).
encoderHF = HuggingFaceEncoder()
hf_rl = RouteLayer(routes=routes, encoder=encoderHF)

# Accuracy on the test messages before fitting.
accuracy = hf_rl.evaluate(
    X=X_test["Message"].to_list(),
    y=y_test,
)
print(f"Accuracy: {accuracy*100:.2f}%")

# Fit the layer on the combined training data, then score the test messages again.
hf_rl.fit(
    X=X_final["Message"].to_list(),
    y=y_final,
    max_iter=500,
)
accuracy = hf_rl.evaluate(
    X=X_test["Message"].to_list(),
    y=y_test,
)
print(f"Accuracy (Post-fit): {accuracy*100:.2f}%")


Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Accuracy: 95.24%


Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Training:   0%|          | 0/500 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Accuracy (Post-fit): 92.86%


In [86]:
# Baseline route layer on the OpenAI encoder, built with default settings
# (tunable options: aggregation = "mean", "max" or "sum"; top_k = 5).
encoderOA = OpenAIEncoder()
oa_rl = RouteLayer(routes=routes, encoder=encoderOA)

# Accuracy on the test messages before fitting.
accuracy = oa_rl.evaluate(
    X=X_test["Message"].to_list(),
    y=y_test,
)
print(f"Accuracy: {accuracy*100:.2f}%")

# Fit the layer on the combined training data, then score the test messages again.
oa_rl.fit(
    X=X_final["Message"].to_list(),
    y=y_final,
    max_iter=500,
)
accuracy = oa_rl.evaluate(
    X=X_test["Message"].to_list(),
    y=y_test,
)
print(f"Accuracy (Post-fit): {accuracy*100:.2f}%")

Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Accuracy: 95.24%


Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Training:   0%|          | 0/500 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Accuracy (Post-fit): 95.24%


In [23]:
# Result: both encoders score 95.24% before fitting; after fitting, OA stays at 95.24% while HF drops to 92.86%, so choose OA

# Tuning Selected Router

## Aggregate

In [88]:
# One OpenAI-encoded layer per aggregation option ("mean", "max", "sum");
# top_k is not passed, so the library default applies.
oa_rl_mean = RouteLayer(routes=routes, encoder=encoderOA, aggregation='mean')
oa_rl_max = RouteLayer(routes=routes, encoder=encoderOA, aggregation='max')
oa_rl_sum = RouteLayer(routes=routes, encoder=encoderOA, aggregation='sum')


In [89]:
# Fit the mean-aggregation layer on the combined training messages and labels.
oa_rl_mean.fit(
    X=X_final["Message"].to_list(),
    y=y_final,
    max_iter=500,
)


Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Training:   0%|          | 0/500 [00:00<?, ?it/s]

In [90]:
# Fit the max-aggregation layer on the combined training messages and labels.
oa_rl_max.fit(
    X=X_final["Message"].to_list(),
    y=y_final,
    max_iter=500,
)

Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Training:   0%|          | 0/500 [00:00<?, ?it/s]

In [91]:
# Fit the sum-aggregation layer on the combined training messages and labels.
oa_rl_sum.fit(
    X=X_final["Message"].to_list(),
    y=y_final,
    max_iter=500,
)

Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Training:   0%|          | 0/500 [00:00<?, ?it/s]

In [92]:
# Score each aggregation variant on the test messages.
accuracy_mean = oa_rl_mean.evaluate(
    X=X_test["Message"].to_list(),
    y=y_test,
)
accuracy_max = oa_rl_max.evaluate(
    X=X_test["Message"].to_list(),
    y=y_test,
)
accuracy_sum = oa_rl_sum.evaluate(
    X=X_test["Message"].to_list(),
    y=y_test,
)

# Report the three accuracies as percentages.
print(f"Accuracy mean: {accuracy_mean*100:.2f}%")
print(f"Accuracy max: {accuracy_max*100:.2f}%")
print(f"Accuracy sum: {accuracy_sum*100:.2f}%")

Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Accuracy mean: 97.62%
Accuracy max: 97.62%
Accuracy sum: 95.24%


In [98]:
# Max-aggregation layers that differ only in top_k (1, 5 and 50).
# NOTE(review): the variable suffixes (_2, _5, _10) do not match the top_k
# values actually passed -- confirm which of the two is intended.
oa_rl_max_2 = RouteLayer(routes=routes, encoder=encoderOA, aggregation='max', top_k=1)
oa_rl_max_5 = RouteLayer(routes=routes, encoder=encoderOA, aggregation='max', top_k=5)
oa_rl_max_10 = RouteLayer(routes=routes, encoder=encoderOA, aggregation='max', top_k=50)



In [97]:
# Fit the oa_rl_max_2 layer on the combined training messages and labels.
oa_rl_max_2.fit(
    X=X_final["Message"].to_list(),
    y=y_final,
    max_iter=500,
)


Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Training:   0%|          | 0/500 [00:00<?, ?it/s]

In [96]:
# Fit the oa_rl_max_5 layer on the combined training messages and labels.
oa_rl_max_5.fit(
    X=X_final["Message"].to_list(),
    y=y_final,
    max_iter=500,
)


Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Training:   0%|          | 0/500 [00:00<?, ?it/s]

In [99]:
# Fit the oa_rl_max_10 layer on the combined training messages and labels.
oa_rl_max_10.fit(
    X=X_final["Message"].to_list(),
    y=y_final,
    max_iter=500,
)


Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Training:   0%|          | 0/500 [00:00<?, ?it/s]

In [100]:
# Score each top_k variant on the test messages.
accuracy_2 = oa_rl_max_2.evaluate(X=X_test["Message"].to_list(), y=y_test)
accuracy_5 = oa_rl_max_5.evaluate(X=X_test["Message"].to_list(), y=y_test)
accuracy_10 = oa_rl_max_10.evaluate(X=X_test["Message"].to_list(), y=y_test)

# The labels state the top_k each layer was built with (1, 5 and 50). The
# previous labels read "k=2" and "k=15", which matched neither the configured
# values nor the variable suffixes.
print(f"Accuracy k=1: {accuracy_2*100:.2f}%")
print(f"Accuracy k=5: {accuracy_5*100:.2f}%")
print(f"Accuracy k=50: {accuracy_10*100:.2f}%")

Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Accuracy k=2: 97.62%
Accuracy k=5: 97.62%
Accuracy k=15: 97.62%


# Final RL


In [64]:
# Use the max-aggregation layer named oa_rl_max_2 as the final router for the report and export below.
final_rl = oa_rl_max_2


# Print metrics

In [65]:
def evaluate_router(rl, messages=None, labels=None, intentions=None):
    """Count, per intention, how many messages were seen and how many were misrouted.

    Parameters
    ----------
    rl : callable
        Router under evaluation. It is called with one message at a time and
        must return an object exposing a ``name`` attribute.
    messages : iterable of str, optional
        Messages to route. Defaults to the notebook's ``X_test["Message"]``.
    labels : iterable, optional
        Expected intention for each message, in the same order. Defaults to
        the notebook's ``y_test``.
    intentions : iterable, optional
        Intentions that always get a row in the report, even with zero
        messages. Defaults to the notebook's ``user_intentions``.

    Returns
    -------
    dict
        Maps each intention to ``[total, wrong]``.
    """
    # Fall back to the notebook globals so existing evaluate_router(rl) calls
    # behave as before.
    if messages is None:
        messages = X_test["Message"]
    if labels is None:
        labels = y_test
    if intentions is None:
        intentions = user_intentions

    REPORT_DATA = {intention: [0, 0] for intention in intentions}

    for message, label in zip(messages, labels):
        prediction = rl(message)

        # setdefault lets a label missing from `intentions` be counted
        # instead of raising KeyError.
        counts = REPORT_DATA.setdefault(label, [0, 0])
        counts[0] += 1
        if prediction.name != label:
            counts[1] += 1

    return REPORT_DATA


In [66]:
# Tabulate the final router's [total, wrong] counts with one row per intention,
# then write the table to a spreadsheet.
evaluation_table = pd.DataFrame(evaluate_router(final_rl)).T
evaluation_table.to_excel('evaluation_results.xlsx')

In [67]:
# Start from an empty report; the following cells fill it per intention.
REPORT_DATA = dict()

In [68]:
# Give every intention its own fresh [total, wrong] counter.
REPORT_DATA.update((intention, [0, 0]) for intention in user_intentions)


In [69]:
# Build the per-intention [total, wrong] report for the final router on the
# test set. This reuses evaluate_router, defined earlier in this notebook,
# instead of repeating its loop. It also rebuilds the report from scratch, so
# re-running the cell no longer adds to the previous counts.
REPORT_DATA = evaluate_router(final_rl)


In [70]:
# Spot check: route a single hand-written message through the final router and display its choice.
final_rl('I want to change my password')

RouteChoice(name='manage_personal_info', function_call=None, similarity_score=None)

In [71]:
# Print every training message the final router gets wrong, as
# "<prediction> <expected label>".
for message, raw_label in zip(X_train["Message"], y_train):
    # y_train still stores the string 'None'; compare against a real None.
    label = None if raw_label == 'None' else raw_label
    prediction = final_rl(message)

    if prediction.name != label:
        print(prediction, label)


name=None function_call=None similarity_score=None manage_personal_info
name=None function_call=None similarity_score=None manage_personal_info
name=None function_call=None similarity_score=None manage_personal_info
name='search_scholarships_and_internationals' function_call=None similarity_score=None None
name=None function_call=None similarity_score=None search_universities
name='matchmaking' function_call=None similarity_score=None search_scholarships_and_internationals
name='matchmaking' function_call=None similarity_score=None search_scholarships_and_internationals
name='query_matches' function_call=None similarity_score=None manage_personal_info
name='matchmaking' function_call=None similarity_score=None search_universities
name='matchmaking' function_call=None similarity_score=None search_universities
name='matchmaking' function_call=None similarity_score=None search_universities
name='search_scholarships_and_internationals' function_call=None similarity_score=None matchmaking
n

In [73]:
# One row per intention: column 0 is the number of test messages seen, column 1 the number routed incorrectly.
pd.DataFrame(REPORT_DATA).T

Unnamed: 0,0,1
manage_personal_info,6,0
search_scholarships_and_internationals,6,0
search_universities,6,0
matchmaking,5,0
query_matches,5,0
leverage_rag,5,0
company_info,6,0
,3,0


# Deploy

In [74]:
# Save the selected route layer's configuration to layer.json for deployment.
final_rl.to_json("layer.json") # This one avoids overfitting the most

[32m2024-12-26 15:11:52 INFO semantic_router.utils.logger Saving route config to layer.json[0m
