In [1]:
# encoding=gbk
import configparser
import time

from torch import Tensor
from typing import Callable
import os
import torch
from transformers import AutoTokenizer, AutoModel
import pickle
import sys
import numpy as np

os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
CONFIG_PATH = "./config.ini"


class VecSim:
    """Similarity measures between embedding vectors."""

    # cos similarity
    @staticmethod
    def cos_sim(a: Tensor, b: Tensor):
        """
        Computes the cosine similarity cos_sim(a[i], b[j]) for all i and j.

        Inputs that are not tensors are converted with ``torch.tensor``; a 1-D
        input is treated as a single row.

        :param a: vector of shape (d,) or matrix of shape (n, d)
        :param b: vector of shape (d,) or matrix of shape (m, d)
        :return: a Python float when the result has exactly one entry (one row
            compared with one row); otherwise the (n, m) matrix with
            res[i][j] = cos_sim(a[i], b[j])
        """
        if not isinstance(a, torch.Tensor):
            a = torch.tensor(a)

        if not isinstance(b, torch.Tensor):
            b = torch.tensor(b)

        if len(a.shape) == 1:
            a = a.unsqueeze(0)

        if len(b.shape) == 1:
            b = b.unsqueeze(0)

        # After L2-normalising every row, the dot product of two rows is their cosine.
        a_norm = torch.nn.functional.normalize(a, p=2, dim=1)
        b_norm = torch.nn.functional.normalize(b, p=2, dim=1)
        sim = torch.mm(a_norm, b_norm.transpose(0, 1))

        # float() only accepts a one-element tensor; keep the scalar return for the
        # one-vs-one case and hand back the full matrix for batched input.
        if sim.numel() == 1:
            return float(sim)
        return sim


def process_bar(num, total):
    """
    Redraw a 100-character text progress bar on the current stdout line.

    :param num: amount of work completed so far
    :param total: total amount of work; a value <= 0 is shown as a full bar
    """
    # total <= 0 means there is nothing to wait for: show 100% instead of
    # raising ZeroDivisionError.
    rate = float(num) / total if total > 0 else 1.0
    # Clamp so the bar is always exactly 100 characters wide.
    rate_num = max(0, min(100, int(100 * rate)))
    # The leading '\r' moves the cursor back to the start of the line so each
    # call overwrites the previous bar.
    r = '\r[{}{}]{}%'.format('*' * rate_num, ' ' * (100 - rate_num), rate_num)
    sys.stdout.write(r)
    sys.stdout.flush()


class Embedder:
    """
    Embeds a list of questions with a HuggingFace encoder and scores a query
    against them.

    Each entry of ``self.embedding`` is a ``(question, vector, answer)`` tuple.
    """

    def __init__(self, question_data: list, answer_data: list, cache_addr: str):
        """
        :param question_data: questions to embed
        :param answer_data: answers, aligned by index with ``question_data``
        :param cache_addr: path of the pickle file used to cache the embeddings
        """
        self.__question_data = question_data
        self.__answer_data = answer_data
        self.__cp = configparser.ConfigParser()
        self.__cp.read(CONFIG_PATH)
        self.embedding: list[tuple] = []
        self.__cache_addr = cache_addr
        # Load tokenizer and model from the location configured under
        # [model] embedding_path (parsed once, from CONFIG_PATH).
        embedding_path = self.__cp['model']['embedding_path']
        self.__tokenizer = AutoTokenizer.from_pretrained(embedding_path)
        self.__model = AutoModel.from_pretrained(embedding_path)

    @staticmethod
    def __mean_pooling(model_output, attention_mask):
        """
        Average the token embeddings, counting only positions where the
        attention mask is set.

        :param model_output: model output whose first element holds the token embeddings
        :param attention_mask: attention mask returned by the tokenizer
        :return: one pooled vector per input sequence
        """
        token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        # clamp() keeps the divisor away from zero for an all-masked sequence.
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def get_embedding(self, chunk: str):
        """
        Embed ``chunk`` with the loaded encoder.

        :param chunk: text to embed
        :return: mean-pooled embedding tensor (presumably one row for a single
            string - confirm against the tokenizer/model in use)
        """
        # Tokenize sentences
        encoded_input = self.__tokenizer(chunk, padding=True, truncation=True, return_tensors='pt')

        # Compute token embeddings
        with torch.no_grad():
            model_output = self.__model(**encoded_input)
        # Perform pooling. In this case, mean pooling.
        return self.__mean_pooling(model_output, encoded_input['attention_mask'])

    def processing(self):
        """
        Fill ``self.embedding`` from the cache file if it exists; otherwise embed
        every question, show a progress bar, and write the cache file.
        """
        if os.path.isfile(self.__cache_addr):
            # NOTE: pickle can execute arbitrary code while loading; only use a
            # cache file that this notebook wrote itself.
            with open(self.__cache_addr, 'rb') as cache_file:
                self.embedding = pickle.load(cache_file)
            return

        total_chunk = len(self.__question_data)
        # Build into a fresh list so a rebuild after rm_cache() does not append
        # duplicates to entries left over from an earlier call.
        embedding = []
        for chunk_idx, chunk in enumerate(self.__question_data):
            # Pair each question with the answer handed to the constructor rather
            # than with whatever global named `answers` exists in the notebook.
            embedding.append((chunk, self.get_embedding(chunk), self.__answer_data[chunk_idx]))
            # Report "chunks done / total"; the divisor is never zero inside the loop.
            process_bar(chunk_idx + 1, total_chunk)
        self.embedding = embedding
        with open(self.__cache_addr, "wb") as cache_file:
            pickle.dump(embedding, cache_file)

    def fetch_similarity(self, query: str, similarity_func: Callable = VecSim.cos_sim):
        """
        Yield the similarity between ``query`` and every stored embedding, in
        storage order.

        :param query: text to compare against the stored questions
        :param similarity_func: callable taking (query_vector, stored_vector)
        """
        query_vec = self.get_embedding(query)
        for em in self.embedding:
            yield similarity_func(query_vec, em[1])

    def get_embedding_data(self, idx: int):
        """Return the ``(question, answer)`` pair stored at position ``idx``."""
        return self.embedding[idx][0], self.embedding[idx][2]

    def rm_cache(self):
        """Delete the cache file if it exists."""
        if os.path.isfile(self.__cache_addr):
            os.remove(self.__cache_addr)

## Start here
Load your data in the next cell to avoid running out of memory.

In [2]:
config = configparser.ConfigParser()
config.read(CONFIG_PATH)

# Set to False after the first run to reuse the cached embeddings instead of
# recomputing them every time this cell is executed.
REBUILD_EMBEDDING_CACHE = True

# NOTE: pickle.load can execute arbitrary code; only open a data.bin you trust.
# The file must unpickle to exactly three items; the third one is not used here.
with open("data.bin", 'rb') as f:
    questions, answers, _ = pickle.load(f)

# embedding knowledge dataset
embedder = Embedder(question_data=questions, answer_data=answers, cache_addr="cache")
if REBUILD_EMBEDDING_CACHE:
    embedder.rm_cache()
print("embedding in progress...")
embedder.processing()

# loading llm model
# NOTE: trust_remote_code=True runs Python code shipped with the model files;
# .cuda() requires a CUDA-capable GPU.
print("\nloading llm...")
llm_path = config['model']['llm_path']
llm_tokenizer = AutoTokenizer.from_pretrained(llm_path, trust_remote_code=True)
llm_model = AutoModel.from_pretrained(llm_path, trust_remote_code=True).cuda()
llm_model = llm_model.eval()
print('finish!')

embedding in progress...
[****************************************************************************************************]100%
loading llm...


Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

finish!


In [6]:
query = input("your query: ")
# One similarity score per stored question, in the order of embedder.embedding.
similarity_vec = list(embedder.fetch_similarity(query, VecSim.cos_sim))

config = configparser.ConfigParser()
config.read(CONFIG_PATH)
# Parse the retrieval settings once instead of on every loop iteration.
max_item = int(config['model']['max_item'])
min_sup = float(config['model']['min_sup'])

select_item_num = max(0, min(len(similarity_vec), max_item))
# Sort descending first and then take the head. Slicing the tail with `[-n:]`
# returns every index when n == 0, because `[-0:]` is the same as `[0:]`.
top_index = np.argsort(similarity_vec)[::-1][:select_item_num]

# The template file holds one or more prompts separated by '%'. Read as bytes
# and decode explicitly so the text is always interpreted as UTF-8.
with open("template", "rb") as template_file:
    template = template_file.read().decode("utf-8").split('%')

# "{1}" in the first prompt is the placeholder for the user's query.
llm_input = template[0].replace("{1}", query)

matched_text = ""
possible_answer = {}
for ele_idx, idx in enumerate(top_index):
    # Scores are in descending order, so the first one below the threshold
    # means all remaining ones are below it too.
    if similarity_vec[idx] < min_sup:
        break
    cur_matched = embedder.get_embedding_data(idx)
    possible_answer[ele_idx+1] = cur_matched
    matched_text += f"{ele_idx+1}: {cur_matched[0]}答案是{cur_matched[1]} "

# "{2}" in the first prompt is the placeholder for the retrieved question/answer pairs.
llm_input = llm_input.replace("{2}", matched_text)
template[0] = llm_input

In [7]:
print(f"USER: {query}")
history = []
current_length = 0

total_update = len(template)
print("BOT: thinking...")
for i in range(total_update):
    if i == total_update - 1:
        # Final prompt: stream the reply, printing only the part of the
        # response that has not been printed yet.
        print("BOT: ", end='')
        for response, _ in llm_model.stream_chat(llm_tokenizer, template[i], history=history, temperature=0.001):
            print(response[current_length:], end='', flush=True)
            current_length = len(response)
    else:
        # Intermediate prompt: run it silently and keep the resulting history.
        # history is passed in so that every earlier turn is carried forward,
        # not only the most recent one.
        for _, history in llm_model.stream_chat(llm_tokenizer, template[i], history=history, temperature=0.001):
            pass

print("\n\nFind similar question:", flush=True)
for k, v in possible_answer.items():
    print(f"{k}: {v[0]}", flush=True)
while True:
    # NOTE(review): presumably gives the notebook front-end time to render the
    # pending output before the input box appears - confirm.
    time.sleep(0.3)
    dec = input("USER: ").strip()
    if dec == "exit":
        break
    try:
        choice = int(dec)
    except ValueError:
        choice = None
    # possible_answer is keyed by the 1-based numbers listed above.
    if choice not in possible_answer:
        print("BOT: 输入不合法！")
        continue
    print(f"USER: {dec}")
    print(f"BOT: {possible_answer[choice][1]}")

USER: xxx是谁
BOT: thinking...
BOT: 爱化妆的女人,新品种,孔子与孟子有何不同?孔子的子在左边,孟子的子在右边,中国古贤人曾将兰色外衣,浸泡于黄河中,结果产生沾湿。

Find similar question:
1: 谁是世界上最有恒心的画家？ 
2: 黃皮肤的人是黃种人，绿皮肤的人属于哪一种？ 
3: 孔子与孟子有何不同？ 
4: 孔子与孟子有何不同？ 
5: 中国古贤人曾将兰色外衣，浸泡于黄河中，结果产生何种现象？ 
USER: 1
BOT: 爱化妆的女人
USER: 2
BOT: 新品种
USER: 3
BOT: “子”的位置不同。孔子的“子”在左边，孟子的“子”
