In [1]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.util import cos_sim

# Bi-encoder models (SentenceTransformer); the comparison cells below query each of them:
# 1. distiluse-base-multilingual-cased-v1 (loaded by model name)
# 2. output/training_hpvqa-distiluse-base-multilingual-cased-v1-20220108114007
# 3. output/training_hpvqa-training_gossip-distiluse-base-multilingual-cased-v1-20211108123754-20220110002927

# Cross-encoder model:
# 1. output/training_quora-2021-10-21_15-20-55 (2 Epoch)
# NOTE(review): the `output/...` entries are relative paths, presumably local
# fine-tuned checkpoints — confirm they exist before running this cell.

bi_model_1 = SentenceTransformer('distiluse-base-multilingual-cased-v1')
bi_model_2 = SentenceTransformer('output/training_hpvqa-distiluse-base-multilingual-cased-v1-20220108114007')
bi_model_3 = SentenceTransformer('output/training_hpvqa-training_gossip-distiluse-base-multilingual-cased-v1-20211108123754-20220110002927')

cs_model = CrossEncoder('output/training_quora-2021-10-21_15-20-55')

In [6]:
def sbert_qa_model(user_question, bi_encoder, cross_encoder, top_k=2):
    """Return the FAQ entries whose questions are most similar to ``user_question``.

    The FAQ is read from ``data/HPV_QA.xlsx`` (columns ``question`` and
    ``answer``) on every call. All unique FAQ questions and the user question
    are embedded with ``bi_encoder``, ranked by cosine similarity, and the
    ``top_k`` best matches are returned together with their answers.

    Args:
        user_question: The question text to look up.
        bi_encoder: Object exposing ``encode(...)``; used to embed both the
            FAQ questions and the user question.
        cross_encoder: Currently unused. The cross-encoder re-ranking stage is
            disabled; the parameter is kept so existing callers keep working.
        top_k: Number of best-matching FAQ questions to return (default 2,
            the value that was previously hard-coded).

    Returns:
        dict with keys ``user_question`` (the input), ``best_match_question``
        (list of FAQ questions, best first), ``best_match_answer`` (their
        answers, same order) and ``score`` (their cosine similarities, same
        order).
    """
    import pandas as pd

    # FAQ
    # The previous version also built whitespace-stripped `Question`/`Answer`
    # columns, but nothing ever read them, so they are no longer computed.
    # NOTE(review): if whitespace normalisation was intended, apply it to
    # `question`/`answer` themselves before encoding — confirm with the author.
    faq_df = pd.read_excel('data/HPV_QA.xlsx')
    faq_question_list = faq_df['question'].unique().tolist()
    # When the same question text occurs more than once, the last answer wins.
    faq_qa_map = dict(zip(faq_df['question'], faq_df['answer']))

    faq_question_embedding = bi_encoder.encode(faq_question_list)
    user_question_embedding = bi_encoder.encode(user_question)

    # Bi Encoder: similarity of the user question against every FAQ question.
    ques_to_ques_sim = cos_sim(user_question_embedding, faq_question_embedding)

    # Rank once and read both the questions and their scores from the same
    # top-k frame, so the two lists are guaranteed to stay aligned.
    top_candidates = pd.DataFrame({
        'question': faq_question_list,
        'sim': ques_to_ques_sim[0]
    }).nlargest(top_k, 'sim')
    candidate_question = top_candidates['question'].tolist()
    candidate_score = top_candidates['sim'].tolist()

    match_answer = [faq_qa_map[x] for x in candidate_question]

    # Output
    output = {
        'user_question': user_question,
        'best_match_question': candidate_question,
        'best_match_answer': match_answer,
        'score': candidate_score
    }

    return output

In [3]:
# Sample user questions fed to each bi-encoder in the comparison cell.
default_question_list = [
    '疫苗有幾類',
    '政府有補助打疫苗嗎',
    '公費疫苗補助多少',
    '疫苗有年齡限制嗎？',
    '會有哪些副作用',
    '有需要特別注意什麼',
    '打完疫苗發燒怎麼辦',
]

In [7]:
# Compare the top FAQ matches of the three bi-encoders for every sample question.
for q in default_question_list:
    # Query all three bi-encoders before printing anything for this question.
    r1, r2, r3 = [
        sbert_qa_model(q, encoder, cs_model)
        for encoder in (bi_model_1, bi_model_2, bi_model_3)
    ]

    print(f"User Question: {r1['user_question']}")
    for tag, result in (('Model1', r1), ('Model2', r2), ('Model3', r3)):
        # Each line pairs every matched FAQ question with its similarity score.
        print(f"{tag}: {list(zip(result['best_match_question'], result['score']))}")

    print('---------')

User Question: 疫苗有幾類
Model1: [('HPV疫苗要接種幾劑', 0.7321795225143433), ('目前政府核准有哪幾種HPV疫苗廠牌?', 0.6231530904769897)]
Model2: [('HPV疫苗要接種幾劑', 0.7184429168701172), ('目前政府核准有哪幾種HPV疫苗廠牌?', 0.6042048931121826)]
Model3: [('HPV疫苗要接種幾劑', 0.783789336681366), ('HPV疫苗的接種對象', 0.7351275682449341)]
---------
User Question: 政府有補助打疫苗嗎
Model1: [('疫苗補助', 0.7323815822601318), ('公費 HPV 疫苗接種，是否強制接種', 0.708055853843689)]
Model2: [('疫苗補助', 0.7179433107376099), ('公費 HPV 疫苗接種，是否強制接種', 0.688248872756958)]
Model3: [('疫苗補助', 0.8300663232803345), ('公費 HPV 疫苗接種，是否強制接種', 0.7567755579948425)]
---------
User Question: 公費疫苗補助多少
Model1: [('疫苗補助', 0.7826864123344421), ('如果看病順便接種公費 HPV 疫苗，需負擔哪些費用', 0.7539790868759155)]
Model2: [('疫苗補助', 0.7704024314880371), ('如果看病順便接種公費 HPV 疫苗，需負擔哪些費用', 0.7409102916717529)]
Model3: [('疫苗補助', 0.8224537968635559), ('如果看病順便接種公費 HPV 疫苗，需負擔哪些費用', 0.8135435581207275)]
---------
User Question: 疫苗有年齡限制嗎？
Model1: [('我女兒還這麼年輕，有需要施打疫苗嗎', 0.6934840679168701), ('接種HPV疫苗期間可以懷孕嗎', 0.6374865770339966)]
Model2: 

In [95]:
# `cs_model_2` is not defined in any visible cell, so this cell raised NameError on a
# fresh kernel. sbert_qa_model does not use its cross_encoder argument (the re-ranking
# stage is disabled), so the already-loaded `cs_model` is passed instead.
for q in ['疫苗補助是否包含葆蓓']:
    r1 = sbert_qa_model(q, bi_model_1, cs_model)
    r2 = sbert_qa_model(q, bi_model_2, cs_model)
    r3 = sbert_qa_model(q, bi_model_3, cs_model)

    print(f"User Question: {r1['user_question']}")
    print(f"Model1: {list(zip(r1['best_match_question'], r1['score']))}")
    print(f"Model2: {list(zip(r2['best_match_question'], r2['score']))}")
    print(f"Model3: {list(zip(r3['best_match_question'], r3['score']))}")

    print('---------')

User Question: 疫苗補助是否包含葆蓓
Model1: [('目前疫苗有哪些', 0.0006677709170617163), ('HPV疫苗有用嗎', 0.0006677709170617163)]
Model2: [('我有血小板缺少症，我適合接種GARDASIL 9疫苗嗎？', 0.0005554489325731993), ('我是血小板缺少症患者，是否能夠接種GARDASIL 9疫苗', 0.0005554489325731993)]
Model3: [('接種HPV疫苗有什麼禁忌事項嗎?', 0.0006544439238496125), ('政府採購之HPV疫苗品質是否有 保障?', 0.0006544439238496125)]
---------


In [94]:
# `cs_model_2` is not defined in any visible cell, so this cell raised NameError on a
# fresh kernel. sbert_qa_model does not use its cross_encoder argument (the re-ranking
# stage is disabled), so the already-loaded `cs_model` is passed instead.
for q in ['HPV是麼麼']:
    r1 = sbert_qa_model(q, bi_model_1, cs_model)
    r2 = sbert_qa_model(q, bi_model_2, cs_model)
    r3 = sbert_qa_model(q, bi_model_3, cs_model)

    print(f"User Question: {r1['user_question']}")
    print(f"Model1: {list(zip(r1['best_match_question'], r1['score']))}")
    print(f"Model2: {list(zip(r2['best_match_question'], r2['score']))}")
    print(f"Model3: {list(zip(r3['best_match_question'], r3['score']))}")

    print('---------')

User Question: HPV是麼麼
Model1: [('HPV是什麼?', 0.14331158995628357), ('HPV是什麼？', 0.14331158995628357)]
Model2: [('什麼是HPV', 0.024990471079945564), ('HPV是什麼?', 0.024990471079945564)]
Model3: [('HPV是什麼?', 0.14331158995628357), ('HPV是什麼？', 0.14331158995628357)]
---------
