In [44]:
from semantic_router import Route
from semantic_router.encoders import OpenAIEncoder, HuggingFaceEncoder
from semantic_router import RouteLayer
import pandas as pd

In [45]:
# Load the dataframes json files
df_synthetic = pd.read_json("synthetic_intetions.json")

X_syn = df_synthetic[['Id','Message']]
y_syn = df_synthetic['Intention'].to_list()

In [46]:
# Load the dataframes json files
df_new = pd.read_json("new_intentions.json")

X_new = df_new[['Id','Message']]
y_new = df_new['Intention'].to_list()

In [55]:
from sklearn.model_selection import train_test_split

# Split the dataset with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X_syn, y_syn, test_size=0.3, random_state=0, stratify=y_syn
)

In [56]:
# Concatenate the train and new dataframes
X_final = pd.concat([X_train, X_new], ignore_index=True)

# Concatenate the train and new labels
y_final = y_train + y_new

In [57]:
# Replace "None" with None
y_final = [None if i == "None" else i for i in y_final]
y_test = [None if i == "None" else i for i in y_test]

In [58]:
y_train

['manage_personal_info',
 'manage_personal_info',
 'matchmaking',
 'None',
 'search_scholarships_and_internationals',
 'query_matches',
 'manage_personal_info',
 'company_info',
 'company_info',
 'None',
 'manage_personal_info',
 'leverage_rag',
 'manage_personal_info',
 'manage_personal_info',
 'leverage_rag',
 'company_info',
 'manage_personal_info',
 'None',
 'query_matches',
 'manage_personal_info',
 'manage_personal_info',
 'company_info',
 'manage_personal_info',
 'search_universities',
 'query_matches',
 'manage_personal_info',
 'search_universities',
 'manage_personal_info',
 'None',
 'leverage_rag',
 'matchmaking',
 'manage_personal_info',
 'manage_personal_info',
 'search_scholarships_and_internationals',
 'search_universities',
 'None',
 'leverage_rag',
 'leverage_rag',
 'matchmaking',
 'None',
 'company_info',
 'None',
 'manage_personal_info',
 'company_info',
 'manage_personal_info',
 'query_matches',
 'search_scholarships_and_internationals',
 'search_universities',
 'lev

In [59]:
user_intentions = ["manage_personal_info", "search_scholarships_and_internationals","search_universities" , "matchmaking", "query_matches", "leverage_rag", "company_info", None]

In [60]:
MESSAGES_INFO = {
}

for intention in user_intentions:
    MESSAGES_INFO[intention] = []

for message, label in zip(X_test["Message"], y_test):
    for intention in user_intentions:
        if label == intention:
            MESSAGES_INFO[intention].append(message)
            continue

MESSAGES_INFO

{'manage_personal_info': ['What is my current education level?',
  'I want to change my username to Student2023.',
  "Please change my education level to bachelor's.",
  'Can you tell me my current country?',
  'Change my country to Canada.',
  'Can you set my preferred study environment to quiet?',
  'What is my current education level?',
  'Can you remind me of my password?',
  "I would like to update my education level to master's.",
  'Please modify my age to 30.',
  'What is my current education level?',
  'Please update my password to Passw0rd!',
  'Please add a preference for urban universities.',
  'I want to add a preference for studying abroad.',
  'What is my current age?',
  'Please modify my education level to high school.',
  'Can you provide details about my education level?',
  'What is my registered age?',
  'Can you change my password to something more secure?',
  'Can you tell me my education level?',
  "I'd like to modify my username to Student2023.",
  'Please modi

In [61]:
manage_personal_info = Route(
    name="manage_personal_info",
    description="The user wants to manage and access his personal information, such as username, age, password, et cetera...; this includes the user asking what is he, what are his user preferences, or to modify his details.",
    utterances=MESSAGES_INFO['manage_personal_info'],
)

company_info = Route(
    name="company_info",
    description="The user wants to know information about the company called 'UniMatch'.",
    utterances=MESSAGES_INFO['company_info'],
)

leverage_rag = Route(
    name="leverage_rag",
    description="The user intends wants the chatbot to extract information from an external source (PDF or website).",
    utterances=MESSAGES_INFO['leverage_rag'],
)

matchmaking = Route(
    name="matchmaking",
    description="The user wants to make matches for his university, specifying some preferences.",
    utterances=MESSAGES_INFO['matchmaking'],
)

query_matches = Route(
    name="query_matches",
    description="The user wishes to access and see the previously-made matches.",
    utterances=MESSAGES_INFO['query_matches'],
)

search_scholarships_and_internationals = Route(
    name="search_scholarships_and_internationals",
    description="The user wishes to search for scholarships or international opportunities.",
    utterances=MESSAGES_INFO['search_scholarships_and_internationals'],
)

search_universities = Route(
    name="search_universities",
    description="The user wishes to search for universities or courses.",
    utterances=MESSAGES_INFO['search_universities'],
)
routes = [manage_personal_info, company_info, leverage_rag, matchmaking, query_matches, search_scholarships_and_internationals, search_universities]



# Baseline Routers

In [62]:
# Standard HF-Encoded RL
encoderHF = HuggingFaceEncoder()
hf_rl = RouteLayer(encoder=encoderHF, routes=routes) #aggregation = "mean", "max" or "sum". #top_k = 5

accuracy = hf_rl.evaluate(X=X_test["Message"].to_list(), y=y_test)
print(f"Accuracy: {accuracy*100:.2f}%")

# Call the fit method on HFEncoder
hf_rl.fit(X=X_final["Message"].to_list(), y=y_final, max_iter=500)
accuracy = hf_rl.evaluate(X=X_test["Message"].to_list(), y=y_test)
print(f"Accuracy (Post-fit): {accuracy*100:.2f}%")


Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Accuracy: 96.38%


Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Training:   0%|          | 0/500 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Accuracy (Post-fit): 94.93%


In [63]:
# Standard OA-Encoded RL
encoderOA = OpenAIEncoder()
oa_rl = RouteLayer(encoder=encoderOA, routes=routes) #aggregation = "mean", "max" or "sum" #top_k = 5

accuracy = oa_rl.evaluate(X=X_test["Message"].to_list(), y=y_test)
print(f"Accuracy: {accuracy*100:.2f}%")

# OpenAIEncoder Fit
oa_rl.fit(X=X_final["Message"].to_list(), y=y_final, max_iter=500)
accuracy = oa_rl.evaluate(X=X_test["Message"].to_list(), y=y_test)
print(f"Accuracy (Post-fit): {accuracy*100:.2f}%")

Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Accuracy: 98.55%


Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Training:   0%|          | 0/500 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Accuracy (Post-fit): 98.55%


In [64]:
oa_rl.to_json("layer.json") # This one avoids overfitting the most

[32m2024-12-27 17:39:53 INFO semantic_router.utils.logger Saving route config to layer.json[0m


In [66]:
for (index, row), label in zip(X_train.iterrows(), y_train):
    message = row["Message"]
    prediction = oa_rl(message)

    if label == 'None':
        label = None
    
    if prediction.name == label:
        continue
    else:
        print(message, prediction, "!=", label)


Please find matches for universities that offer scholarships for international students. name='search_scholarships_and_internationals' function_call=None similarity_score=None != matchmaking
Have you ever traveled to a different country? name='manage_personal_info' function_call=None similarity_score=None != None
What are the requirements for studying abroad at universities? name='search_scholarships_and_internationals' function_call=None similarity_score=None != search_universities


KeyboardInterrupt: 

In [23]:
# Result: both are the same, choose OA

# Tuning Selected Router

## Aggregate

In [67]:
oa_rl_mean = RouteLayer(encoder=encoderOA, routes=routes, aggregation='mean') #aggregation = "mean", "max" or "sum" #top_k = 5
oa_rl_max = RouteLayer(encoder=encoderOA, routes=routes, aggregation='max') #aggregation = "mean", "max" or "sum" #top_k = 5
oa_rl_sum = RouteLayer(encoder=encoderOA, routes=routes, aggregation='sum') #aggregation = "mean", "max" or "sum" #top_k = 5


In [68]:
oa_rl_mean.fit(X=X_final["Message"].to_list(), y=y_final, max_iter=500)


Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Training:   0%|          | 0/500 [00:00<?, ?it/s]

In [69]:
oa_rl_max.fit(X=X_final["Message"].to_list(), y=y_final, max_iter=500)

Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Training:   0%|          | 0/500 [00:00<?, ?it/s]

In [70]:
oa_rl_sum.fit(X=X_final["Message"].to_list(), y=y_final, max_iter=500)

Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Training:   0%|          | 0/500 [00:00<?, ?it/s]

In [71]:
accuracy_mean = oa_rl_mean.evaluate(X=X_test["Message"].to_list(), y=y_test)
accuracy_max = oa_rl_max.evaluate(X=X_test["Message"].to_list(), y=y_test)
accuracy_sum = oa_rl_sum.evaluate(X=X_test["Message"].to_list(), y=y_test)

print(f"Accuracy mean: {accuracy_mean*100:.2f}%")
print(f"Accuracy max: {accuracy_max*100:.2f}%")
print(f"Accuracy sum: {accuracy_sum*100:.2f}%")

Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Accuracy mean: 97.10%
Accuracy max: 97.83%
Accuracy sum: 97.83%


In [72]:
oa_rl_max_2 = RouteLayer(encoder=encoderOA, routes=routes, aggregation='sum', top_k=1) #aggregation = "mean", "max" or "sum" #top_k = 5
oa_rl_max_5 = RouteLayer(encoder=encoderOA, routes=routes, aggregation='sum', top_k = 5) #aggregation = "mean", "max" or "sum" #top_k = 5
oa_rl_max_10 = RouteLayer(encoder=encoderOA, routes=routes, aggregation='sum', top_k = 50) #aggregation = "mean", "max" or "sum" #top_k = 5



In [73]:
oa_rl_max_2.fit(X=X_final["Message"].to_list(), y=y_final, max_iter=500)


Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Training:   0%|          | 0/500 [00:00<?, ?it/s]

In [74]:
oa_rl_max_5.fit(X=X_final["Message"].to_list(), y=y_final, max_iter=500)


Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Training:   0%|          | 0/500 [00:00<?, ?it/s]

In [75]:
oa_rl_max_10.fit(X=X_final["Message"].to_list(), y=y_final, max_iter=500)


Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Training:   0%|          | 0/500 [00:00<?, ?it/s]

In [40]:
accuracy_2 = oa_rl_max_2.evaluate(X=X_test["Message"].to_list(), y=y_test)
accuracy_5 = oa_rl_max_5.evaluate(X=X_test["Message"].to_list(), y=y_test)
accuracy_10 = oa_rl_max_10.evaluate(X=X_test["Message"].to_list(), y=y_test)

print(f"Accuracy k=2: {accuracy_2*100:.2f}%")
print(f"Accuracy k=5: {accuracy_5*100:.2f}%")
print(f"Accuracy k=15: {accuracy_10*100:.2f}%")

Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Accuracy k=2: 89.13%
Accuracy k=5: 89.13%
Accuracy k=15: 89.13%


# Final RL


In [87]:
from sklearn.model_selection import train_test_split

# Split the dataset with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X_syn, y_syn, test_size=0.1, random_state=0, stratify=y_syn
)

In [88]:
# Concatenate the train and new dataframes
X_final = pd.concat([X_train, X_new], ignore_index=True)

# Concatenate the train and new labels
y_final = y_train + y_new

In [89]:
# Replace "None" with None
y_final = [None if i == "None" else i for i in y_final]
y_test = [None if i == "None" else i for i in y_test]

In [90]:
final_rl = RouteLayer(encoder=encoderOA, routes=routes, aggregation='sum', top_k = 5)
final_rl.fit(X=X_final["Message"].to_list(), y=y_final, max_iter=500)
acc = final_rl.evaluate(X=X_test["Message"].to_list(), y=y_test)

print(f"Accuracy of final router: {acc*100:.2f}%")

Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Training:   0%|          | 0/500 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Accuracy of final router: 93.48%


# Print metrics

In [91]:
def evaluate_router(rl):
    REPORT_DATA = {
}
    for intention in user_intentions:
        REPORT_DATA[intention] = [0, 0]

    for (index, row), label in zip(X_test.iterrows(), y_test):
        message = row["Message"]
        prediction = rl(message)

        total, wrong = REPORT_DATA[label]
        total += 1
        
        if prediction.name == label:
            pass
        else:
            wrong += 1
        REPORT_DATA[label] = [total, wrong]
    
    return REPORT_DATA


In [92]:
pd.DataFrame(evaluate_router(final_rl)).T.to_excel('evaluation_results.xlsx')

In [93]:
final_rl.to_json("layer.json") # This one avoids overfitting the most

[32m2024-12-27 18:35:04 INFO semantic_router.utils.logger Saving route config to layer.json[0m
