In [6]:
import time
import json
import pandas as pd
from sentence_transformers import util, SentenceTransformer, CrossEncoder

# Load the Sentence-BERT bi-encoder that is used to embed the context
# (question) list. The model runs on CPU.
# Alternative checkpoint, kept for reference:
# bi_model_name = 'TingChenChang/qqp-nli-training-paraphrase-multilingual-MiniLM-L12-v2'
bi_model_name = 'TingChenChang/hpv-multi-qa-mpnet-zh'
bi_encoder = SentenceTransformer(bi_model_name, device='cpu')

# Cross-encoder loading is disabled here: no model name has been chosen yet.
# cross_model_name = ''
# cross_encoder = CrossEncoder(cross_model_name, device='cpu')

# Disabled: an additional SentenceTransformer checkpoint (`model_base`).
# model_name_base = 'TingChenChang/make-multilingual-en-zh-tw-20220825062338'
# model_base = SentenceTransformer(model_name_base, device='cpu')

In [6]:
import pandas as pd
from sentence_transformers import util, SentenceTransformer, CrossEncoder
import time

class SBERTQAModel:
    """FAQ question matcher built on a Sentence-BERT bi-encoder.

    Typical use::

        model = SBERTQAModel()
        model.get_match_question('some user question', top_k=5)

    The FAQ corpus is read and embedded once (on the first query, or on an
    explicit ``get_faq_data`` call) and then reused for later queries.
    Call ``get_faq_data`` again to pick up changes to the CSV file.
    """

    def __init__(self, 
                 bi_model_name:str=None,
                 cross_model_name:str=None) -> None:
        """Load the bi-encoder and the cross-encoder on CPU.

        Args:
            bi_model_name: Model name or path passed to ``SentenceTransformer``.
                ``None`` selects the default checkpoint below.
            cross_model_name: Model name or path passed to ``CrossEncoder``.
                ``None`` selects the default checkpoint below.
        """
        t = time.perf_counter()
        if bi_model_name is None:
            bi_model_name = 'TingChenChang/qqp-nli-training-paraphrase-multilingual-MiniLM-L12-v2'
        
        if cross_model_name is None:
            cross_model_name = 'TingChenChang/cross-encoder-qqp-lcqmc-training-paraphrase-multilingual-MiniLM-L12-v2'
        
        self.bi_encoder = SentenceTransformer(bi_model_name, device='cpu')
        # The cross-encoder is loaded but only used by the (currently
        # disabled) re-ranking step in get_match_question.
        self.cross_encoder = CrossEncoder(cross_model_name, device='cpu')
        print(f'Model Load: {time.perf_counter() - t:.6f}s')

        # FAQ corpus state, populated by get_faq_data().
        self.faq_df = None
        self.faq_question_list = None
        self.faq_question_embedding = None
        # Result of the most recent get_match_question() call.
        self.candidate_question_list = []
        
    def get_faq_data(self, file_path:str=None):
        """Read the FAQ CSV and embed its unique questions.

        Sets ``self.faq_df``, ``self.faq_question_list`` and
        ``self.faq_question_embedding``. Calling it again reloads and
        re-encodes the corpus.

        Args:
            file_path: CSV file with a ``question`` column. ``None`` selects
                the default file name below.
        """
        if file_path is None:
            # file_path = 'question_classify.csv'
            file_path = 'HPV QA Collection - add_in_other_words .csv'
        
        # Normalise questions: remove every ASCII space, then strip remaining
        # leading/trailing whitespace. The .str accessor leaves missing
        # values as NaN instead of raising on them.
        self.faq_df = pd.read_csv(file_path)\
            .assign(
                question=lambda df:df['question']
                    .str.replace(' ', '', regex=False)
                    .str.strip(),
            )
        # Rows without a question stay in faq_df but cannot be embedded,
        # so they are left out of the list that is encoded and searched.
        self.faq_question_list = self.faq_df['question'].dropna().unique().tolist()
        
        t = time.perf_counter()
        self.faq_question_embedding = self.bi_encoder.encode(self.faq_question_list)
        print(f'Question List Encode: {time.perf_counter() - t:.6f}s')

    def _ensure_faq_loaded(self) -> None:
        """Load the default FAQ corpus unless an embedding is already cached."""
        # getattr keeps this safe even if __init__ was bypassed.
        if getattr(self, 'faq_question_embedding', None) is None:
            self.get_faq_data()
            
    def get_match_question(self, user_question:str, top_k:int=5):
        """Find the FAQ questions most similar to ``user_question``.

        Prints timing information and the candidates, stores them in
        ``self.candidate_question_list`` and returns them.

        Args:
            user_question: The free-text question to match.
            top_k: Maximum number of candidates to retrieve.

        Returns:
            A list of ``{'question': str, 'score': float}`` dicts in the
            order produced by ``util.semantic_search`` (empty when the FAQ
            corpus has no questions).
        """

        t = time.perf_counter()
        user_question_embedding = self.bi_encoder.encode(user_question)
        
        print(f' ------------ User Question Encode - {time.perf_counter() - t:.6f}s ------------')
        print(user_question)
        
        # Embed the corpus only once; re-encoding it on every query dominated
        # the per-question latency.
        self._ensure_faq_loaded()

        t = time.perf_counter()
        if self.faq_question_list:
            hits = util.semantic_search(
                user_question_embedding, 
                self.faq_question_embedding, 
                top_k=top_k
                )
            # hits[0] holds the results for the single query; each hit
            # carries a 'corpus_id' index into faq_question_list and a 'score'.
            self.candidate_question_list = [
                {
                    'question': self.faq_question_list[hit['corpus_id']],
                    'score': hit['score'],
                }
                for hit in hits[0]
            ]
        else:
            # Nothing to search against.
            self.candidate_question_list = []
        
        print(' ')
        print(f' ------------ SBERT Candidate - {time.perf_counter() - t:.6f}s ------------')
        for x in self.candidate_question_list:
            print(x)

        # Cross-encoder re-ranking of the candidates is currently disabled.
        # # Cross Encoder
        # t = time.perf_counter()
        # question_pair = list(zip([user_question]*len(self.candidate_question_list), [x['question'] for x in self.candidate_question_list]))
        # scores = self.cross_encoder.predict(question_pair)
        # # Match Question
        # match_question = {q:s for q, s in zip([x['question'] for x in self.candidate_question_list], scores)}
        # match_question = sorted(match_question.items(), key=lambda x:x[1], reverse=True)
        
        # print(' ')
        # print(f' ------------ Two Question Pair - {time.perf_counter() - t:.6f}s ------------')
        
        # return match_question

        return self.candidate_question_list
        
# Build the shared QA model instance used by the cells below.
# Only the bi-encoder is overridden; cross_model_name is left at its default.
# The active value looks like a relative filesystem path, unlike the
# commented-out alternatives — TODO confirm it exists in the working directory.
sbert_model = SBERTQAModel(
    # bi_model_name='TingChenChang/hpv-multi-qa-mpnet-zh',
    # bi_model_name='TingChenChang/make-multilingual-en-zh-tw-20220825062338'
    bi_model_name='qqp-training/models/hpv-qqp-para-multi-mpnet'
) 

Model Load: 15.969239s


In [13]:
# Example query (roughly: "Do the elderly need a booster dose?") asking for
# the 5 closest FAQ questions; the method prints timings and candidates.
sbert_model.get_match_question('老人有需要接種追加劑嗎', 5)

 ------------ User Question Encode - 0.043937s ------------
老人有需要接種追加劑嗎
Question List Encode: 4.466869s
 
 ------------ SBERT Candidate - 0.001658s ------------
{'question': '什麼對象可以打追加劑？', 'score': 0.7261231541633606}
{'question': '接種之後5年，需不需要再接種?', 'score': 0.6956974267959595}
{'question': '什麼對象可以打第二次追加劑？', 'score': 0.6668354272842407}
{'question': '長者若行動不便，有到宅接種服務嗎？', 'score': 0.6325038075447083}
{'question': 'COVID-19疫苗適合接種在哪一個年齡層?', 'score': 0.631677508354187}


In [12]:
# sbert_model.candidate_question_list

In [5]:
from sentence_transformers.util import cos_sim
# Sanity check: cosine similarity between the bi-encoder embeddings of two
# short terms (roughly "light-blocking" vs. "sun protection").
cos_sim(
    sbert_model.bi_encoder.encode('遮光'),
    sbert_model.bi_encoder.encode('防曬')
)

tensor([[0.6511]])

In [3]:
import json
import pandas as pd

# Read a table from the system clipboard; it must contain a 'question' column.
# NOTE(review): clipboard input is not reproducible on Restart & Run All —
# consider reading from a file instead.
data = pd.read_clipboard()
context_list = data['question'].unique().tolist()

# Write the unique questions to disk. The context manager closes (and
# flushes) the file deterministically instead of leaking the handle.
with open('context.json', 'w') as context_file:
    json.dump(context_list, context_file)

# Last expression of the cell, so the notebook displays the question count.
len(context_list)

In [14]:
# Load the fine-tuned bi-encoder from its local directory and upload it to the
# Hugging Face Hub repository 'hpv-qqp-para-multi-mpnet'.
# exist_ok=True allows pushing to a repository that already exists;
# replace_model_card=True overwrites the model card there.
# NOTE(review): newer sentence-transformers releases appear to deprecate
# save_to_hub in favour of push_to_hub — confirm against the installed version.
model = SentenceTransformer('qqp-training/models/hpv-qqp-para-multi-mpnet')
model.save_to_hub(
    repo_name='hpv-qqp-para-multi-mpnet',
    local_model_path='qqp-training/models/hpv-qqp-para-multi-mpnet',
    exist_ok=True,
    replace_model_card=True
)



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Cloning https://huggingface.co/TingChenChang/hpv-qqp-para-multi-mpnet into local empty directory.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Upload file pytorch_model.bin:   0%|          | 32.0k/1.04G [00:00<?, ?B/s]

Upload file tokenizer.json:   0%|          | 32.0k/16.3M [00:00<?, ?B/s]

Upload file sentencepiece.bpe.model:   1%|          | 32.0k/4.83M [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/TingChenChang/hpv-qqp-para-multi-mpnet
   47177da..65765e4  main -> main



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


'https://huggingface.co/TingChenChang/hpv-qqp-para-multi-mpnet/commit/65765e42aaeb1d61c94b50a07120e5368b0f0692'