In [None]:
!git clone https://github.com/mindspore-lab/mindnlp.git

In [None]:
%cd mindnlp
!git checkout ef64a3b83097c9578bb0d5326f905beeb5b50e1d
!bash scripts/build_and_reinstall.sh

In [None]:
%cd ../
!pip install -r requirements.txt

# 导入相关包

In [1]:
import os
import PyPDF2
import markdown
import json
import tiktoken
from bs4 import BeautifulSoup
import re
import os
from copy import copy
from typing import Dict, List, Optional, Tuple, Union
import numpy as np
from tqdm import tqdm
from mindspore import Tensor

enc = tiktoken.get_encoding("cl100k_base")

  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)
  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)


# 2. 读取文件
1. 读取文件：读取对应文件夹下所有文件。

2. 提取内容：判断文件类型，设计提取内容方式，实现多种格式统一化处理。

3. 分块：采用基于最大 token 长度和覆盖内容的逻辑分割长文本，确保段落间的语义连续性。

In [2]:
class ReadFiles:
    """
    class to read files
    """

    def __init__(self, path: str) -> None:
        self._path = path
        self.file_list = self.get_files()
    
    @classmethod
    def read_pdf(cls, file_path: str):
        # 读取PDF文件
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            text = ""
            for page_num in range(len(reader.pages)):
                text += reader.pages[page_num].extract_text()
            return text

    @classmethod
    def read_markdown(cls, file_path: str):
        # 读取Markdown文件
        with open(file_path, 'r', encoding='utf-8') as file:
            md_text = file.read()
            html_text = markdown.markdown(md_text)
            # 使用BeautifulSoup从HTML中提取纯文本
            soup = BeautifulSoup(html_text, 'html.parser')
            plain_text = soup.get_text()
            # 使用正则表达式移除网址链接
            text = re.sub(r'http\S+', '', plain_text) 
            return text

    @classmethod
    def read_text(cls, file_path: str):
        # 读取文本文件
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    
    def get_files(self):
        # args：dir_path，目标文件夹路径
        file_list = []
        for filepath, dirnames, filenames in os.walk(self._path):
            # os.walk 函数将递归遍历指定文件夹
            for filename in filenames:
                # 通过后缀名判断文件类型是否满足要求
                if filename.endswith(".md"):
                    # 如果满足要求，将其绝对路径加入到结果列表
                    file_list.append(os.path.join(filepath, filename))
                elif filename.endswith(".txt"):
                    file_list.append(os.path.join(filepath, filename))
                elif filename.endswith(".pdf"):
                    file_list.append(os.path.join(filepath, filename))
        return file_list

    def get_content(self, max_token_len: int = 600, cover_content: int = 150):
        docs = []
        # 读取文件内容
        for file in self.file_list:
            content = self.read_file_content(file)
            chunk_content = self.get_chunk(
                content, max_token_len=max_token_len, cover_content=cover_content)
            docs.extend(chunk_content)
        return docs

    @classmethod
    def get_chunk(cls, text: str, max_token_len: int = 600, cover_content: int = 150):
        chunk_text = []

        curr_len = 0
        curr_chunk = ''

        token_len = max_token_len - cover_content
        lines = text.splitlines()  # 假设以换行符分割文本为行

        for line in lines:
            line = line.replace(' ', '')
            line_len = len(enc.encode(line))
            if line_len > max_token_len:
                # 如果单行长度就超过限制，则将其分割成多个块
                num_chunks = (line_len + token_len - 1) // token_len
                for i in range(num_chunks):
                    start = i * token_len
                    end = start + token_len
                    # 避免跨单词分割
                    while not line[start:end].rstrip().isspace():
                        start += 1
                        end += 1
                        if start >= line_len:
                            break
                    curr_chunk = curr_chunk[-cover_content:] + line[start:end]
                    chunk_text.append(curr_chunk)
                # 处理最后一个块
                start = (num_chunks - 1) * token_len
                curr_chunk = curr_chunk[-cover_content:] + line[start:end]
                chunk_text.append(curr_chunk)
                
            if curr_len + line_len <= token_len:
                curr_chunk += line
                curr_chunk += '\n'
                curr_len += line_len
                curr_len += 1
            else:
                chunk_text.append(curr_chunk)
                curr_chunk = curr_chunk[-cover_content:]+line
                curr_len = line_len + cover_content

        if curr_chunk:
            chunk_text.append(curr_chunk)

        return chunk_text

    @classmethod
    def read_file_content(cls, file_path: str):
        # 根据文件扩展名选择读取方法
        if file_path.endswith('.pdf'):
            return cls.read_pdf(file_path)
        elif file_path.endswith('.md'):
            return cls.read_markdown(file_path)
        elif file_path.endswith('.txt'):
            return cls.read_text(file_path)
        else:
            raise ValueError("Unsupported file type")



In [None]:
text = ReadFiles('./data').get_content(max_token_len=600, cover_content=150)  # 获得data目录下的所有文件内容并分割
text

# 3. 设计Embedding

In [3]:
class BaseEmbeddings:
    """
    Base class for embeddings
    """

    def __init__(self, path: str, is_api: bool) -> None:
        self.path = path
        self.is_api = is_api

    def get_embedding(self, text: str, model: str) -> List[float]:
        raise NotImplementedError

    @classmethod
    def cosine_similarity(cls, vector1: List[float], vector2: List[float]) -> float:
        """
        calculate cosine similarity between two vectors
        """
        dot_product = np.dot(vector1, vector2)
        magnitude = np.linalg.norm(vector1) * np.linalg.norm(vector2)
        if not magnitude:
            return 0
        return dot_product / magnitude

In [4]:
class MindNLPEmbedding(BaseEmbeddings):
    """
    class for MindNLP embeddings
    """
    def __init__(self, path: str = 'BAAI/bge-base-zh-v1.5', is_api: bool = False) -> None:
        super().__init__(path, is_api)
        self._model = self.load_model(path)

    def get_embedding(self, text: str):
        sentence_embedding = self._model.encode([text], normalize_embeddings=True)
        return sentence_embedding

    def load_model(self, path: str):
        from mindnlp.sentence import SentenceTransformer
        model = SentenceTransformer(path)
        return model

    @classmethod
    def cosine_similarity(cls, sentence_embedding_1, sentence_embedding_2):
        """
        calculate similarity between two vectors
        """
        similarity = sentence_embedding_1 @ sentence_embedding_2.T
        return similarity

In [28]:
embedding = MindNLPEmbedding("BAAI/bge-base-zh-v1.5")
embedding._model

No sentence-transformers model found with name BAAI/bge-base-zh-v1.5. Creating a new one with MEAN pooling.


SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

# 4. 知识库设计

In [5]:
class VectorStore:
    def __init__(self, document: List[str] = ['']) -> None:
        self.document = document

    def get_vector(self, EmbeddingModel: BaseEmbeddings):
        self.vectors = []
        for doc in tqdm(self.document, desc="Calculating embeddings"):
            self.vectors.append(EmbeddingModel.get_embedding(doc))
        return self.vectors

    def persist(self, path: str = 'storage'):
        if not os.path.exists(path):
            os.makedirs(path)
        with open(f"{path}/document.json", 'w', encoding='utf-8') as f:
            json.dump(self.document, f, ensure_ascii=False)
        if self.vectors:
            # 将 numpy.ndarray 转换为列表
            vectors_list = [vector.tolist() for vector in self.vectors]
            with open(f"{path}/vectors.json", 'w', encoding='utf-8') as f:
                json.dump(vectors_list, f)

    def load_vector(self, EmbeddingModel: BaseEmbeddings, path: str = 'storage'):
        with open(f"{path}/vectors.json", 'r', encoding='utf-8') as f:
            vectors_list = json.load(f)
        with open(f"{path}/document.json", 'r', encoding='utf-8') as f:
            self.document = json.load(f)

        # 查询 EmbeddingModel 的类别
        if isinstance(EmbeddingModel, MindNLPEmbedding):
            # 将列表重新变为 numpy.ndarray
            self.vectors = [np.array(vector) for vector in vectors_list]
        else:
            self.vectors = vectors_list

    def get_similarity(self, vector1, vector2, EmbeddingModel: BaseEmbeddings):
        return EmbeddingModel.cosine_similarity(vector1, vector2)

    def query(self, query: str, EmbeddingModel: BaseEmbeddings, k: int = 1):
        # 获取查询字符串的嵌入向量
        query_vector = EmbeddingModel.get_embedding(query)

        # 计算查询向量与数据库中每个向量的相似度
        similarities = [self.get_similarity(query_vector, vector, EmbeddingModel) for vector in self.vectors]

        # 将相似度、向量和文档存储在一个列表中
        results = []
        for similarity, vector, document in zip(similarities, self.vectors, self.document):
            results.append({
                'similarity': similarity,
                'vector': vector,
                'document': document
            })
        # 按相似度从高到低排序
        results.sort(key=lambda x: x['similarity'], reverse=True)
        # 获取最相似的 k 个文档
        top_k_documents = [result['document'] for result in results[:k]]

        return top_k_documents

In [30]:
vector = VectorStore(text)
vector.get_vector(EmbeddingModel=embedding)
vector.persist(path='storage')  # 将向量和文档内容保存到storage目录下，下次再用就可以直接加载本地的数据库
vector.load_vector(EmbeddingModel=embedding, path='./storage')  # 加载本地的数据库

No sentence-transformers model found with name BAAI/bge-base-zh-v1.5. Creating a new one with MEAN pooling.
Calculating embeddings: 100%|██████████| 30/30 [00:10<00:00,  2.99it/s]


# 5. 大语言模型

In [6]:
class BaseModel:
    def __init__(self, path: str = '') -> None:
        self.path = path

    def chat(self, prompt: str, history: List[dict], content: str) -> str:
        pass

    def load_model(self):
        pass

In [7]:
PROMPT_TEMPLATE = dict(
    RAG_PROMPT_TEMPALTE="""使用以上下文来回答用户的问题。如果你不知道答案，请输出我不知道。总是使用中文回答。
        问题: {question}
        可参考的上下文：
        ···
        {context}
        ···
        如果给定的上下文无法让你做出回答，请回答数据库中没有这个内容，你不知道。
        有用的回答:""",
    MindNLP_PROMPT_TEMPALTE="""先对上下文进行内容总结,再使用上下文来回答用户的问题。如果你不知道答案，请输出我不知道。总是使用中文回答。
        问题: {question}
        可参考的上下文：
        ···
        {context}
        ···
        如果给定的上下文无法让你做出回答，请回答数据库中没有这个内容，你不知道。
        有用的回答:"""
)

In [8]:
class MindNLPChat(BaseModel):
    def __init__(self, path: str = '') -> None:
        super().__init__(path)
        self.load_model()

    def chat(self, prompt: str, history: List = [], content: str = '') -> str:
        prompt = PROMPT_TEMPLATE['MindNLP_PROMPT_TEMPALTE'].format(question=prompt, context=content)
        response, history = self.model.chat(self.tokenizer, prompt, history, max_length=512)
        return response

    def load_model(self):
        import mindspore
        from mindnlp.transformers import AutoTokenizer, AutoModelForCausalLM
        self.tokenizer = AutoTokenizer.from_pretrained(self.path, mirror="huggingface")
        self.model = AutoModelForCausalLM.from_pretrained(self.path, ms_dtype=mindspore.float16, mirror="huggingface")

In [9]:
# 没有保存数据库
docs = ReadFiles('./data').get_content(max_token_len=600, cover_content=150)  # 获得data目录下的所有文件内容并分割
vector = VectorStore(docs)
embedding = MindNLPEmbedding("BAAI/bge-base-zh-v1.5")  # 创建EmbeddingModel
vector.get_vector(EmbeddingModel=embedding)
vector.persist(path='storage')  # 将向量和文档内容保存到storage目录下，下次再用就可以直接加载本地的数据库

vector.load_vector(EmbeddingModel=embedding, path='./storage')  # 加载本地的数据库

question = 'git如何新建分支？'

content = vector.query(question, EmbeddingModel=embedding, k=1)
print(content)
chat = MindNLPChat(path='openbmb/MiniCPM-2B-dpo-bf16')
print(chat.chat(question, [], content))

  from .autonotebook import tqdm as notebook_tqdm
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 1.290 seconds.
Prefix dict has been built successfully.
No sentence-transformers model found with name BAAI/bge-base-zh-v1.5. Creating a new one with MEAN pooling.


[MS_ALLOC_CONF]Runtime config:  enable_vmm:True  vmm_align_size:2MB


Calculating embeddings: 100%|██████████| 30/30 [00:08<00:00,  3.38it/s]


['远程仓库，可以使用gitpush命令。通常，这个命令后面会跟远程仓库的名称和要推送的分支名称。\nbash\ngitpush<remote-name><branch-name>\n例如，将本地的master分支推送到origin远程仓库：\nbash\ngitpushoriginmaster\n从远程仓库拉取\n从远程仓库获取最新的更改并合并到本地分支，可以使用gitpull命令。这个命令会将远程仓库的指定分支的更改拉取到当前分支。bash\ngitpull<remote-name><branch-name>\n例如，从origin远程仓库的master分支拉取最新更改：\nbash\ngitpulloriginmaster\n远程分支管理\n查看远程分支，可以使用gitbranch命令加上-r选项。\nbash\ngitbranch-r\n删除远程分支，可以使用gitpush命令加上--delete选项。\nbash\ngitpush<remote-name>--delete<branch-name>\n例如，删除origin远程仓库的feature分支：\nbash\ngitpushorigin--deletefeature\n远程仓库的协作与贡献\n协作和贡献通常涉及以下步骤：\n\nFork远程仓库。\nCloneFork后的仓库到本地。\n创建新的分支进行开发。\n完成开发后，将分支推送到自己的Fork仓库。\n']


MiniCPMForCausalLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`.`PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The `seen_tokens` attribute is deprecated.41. Use the `cache_position` model input instead.


要使用Git新建分支，首先需要确保已经安装了Git。新建分支的步骤如下：

1. 打开终端或命令提示符。
2. 切换到你想要创建分支的仓库。
3. 使用`git checkout -b <分支名>`命令创建一个新的分支。例如，如果你想要创建一个名为"new-feature"的分支，你可以输入以下命令：

```
git checkout -b new-feature
```

4. 现在，你已经在本地仓库中创建了一个名为"new-feature"的新分支。

如果你想要将这个分支推送到远程仓库，可以使用`git push`命令。例如，如果你想要将"new-feature"分支推送到origin远程仓库，你可以输入以下命令：


In [None]:
# 保存数据库之后
vector = VectorStore()
vector.load_vector(EmbeddingModel=embedding, path='./storage')  # 加载本地的数据库
question = 'git如何新建分支？'
content = vector.query(question, EmbeddingModel=embedding, k=3)[0]
print(content)
chat = MindNLPChat(path='openbmb/MiniCPM-2B-dpo-bf16')
print(chat.chat(question, [], content))

# 6. Rerank

In [10]:
class BaseReranker:
    """
    Base class for reranker
    """

    def __init__(self, path: str) -> None:
        self.path = path

    def rerank(self, text: str, content: List[str], k: int) -> List[str]:
        raise NotImplementedError

In [11]:
class MindNLPReranker(BaseReranker):
    """
    class for MindNLP reranker
    """

    def __init__(self, path: str = 'BAAI/bge-reranker-base') -> None:
        super().__init__(path)
        self._model= self.load_model(path)

    def rerank(self, text: str, content: List[str], k: int) -> List[str]:
        query_embedding = self._model.encode(text, normalize_embeddings=True)
        sentences_embedding = self._model.encode(sentences=content, normalize_embeddings=True)
        similarity = query_embedding @ sentences_embedding.T
        # 获取按相似度排序后的索引
        ranked_indices = np.argsort(similarity)[::-1]  # 按相似度降序排序
        # 选择前 k 个最相关的候选内容
        top_k_sentences = [content[i] for i in ranked_indices[:k]]
        return top_k_sentences

    def load_model(self, path: str):
        from mindnlp.sentence import SentenceTransformer
        model = SentenceTransformer(path)
        return model

In [12]:
# 创建RerankerModel
reranker = MindNLPReranker('BAAI/bge-reranker-base')

vector = VectorStore()
vector.load_vector(EmbeddingModel=embedding, path='./storage')  # 加载本地的数据库

question = 'git如何新建分支？'

# 从向量数据库中查询出最相似的3个文档
content = vector.query(question, EmbeddingModel=embedding, k=3)
print(content)

No sentence-transformers model found with name BAAI/bge-reranker-base. Creating a new one with MEAN pooling.
Some weights of XLMRobertaModel were not initialized from the model checkpoint at BAAI/bge-reranker-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


['远程仓库，可以使用gitpush命令。通常，这个命令后面会跟远程仓库的名称和要推送的分支名称。\nbash\ngitpush<remote-name><branch-name>\n例如，将本地的master分支推送到origin远程仓库：\nbash\ngitpushoriginmaster\n从远程仓库拉取\n从远程仓库获取最新的更改并合并到本地分支，可以使用gitpull命令。这个命令会将远程仓库的指定分支的更改拉取到当前分支。bash\ngitpull<remote-name><branch-name>\n例如，从origin远程仓库的master分支拉取最新更改：\nbash\ngitpulloriginmaster\n远程分支管理\n查看远程分支，可以使用gitbranch命令加上-r选项。\nbash\ngitbranch-r\n删除远程分支，可以使用gitpush命令加上--delete选项。\nbash\ngitpush<remote-name>--delete<branch-name>\n例如，删除origin远程仓库的feature分支：\nbash\ngitpushorigin--deletefeature\n远程仓库的协作与贡献\n协作和贡献通常涉及以下步骤：\n\nFork远程仓库。\nCloneFork后的仓库到本地。\n创建新的分支进行开发。\n完成开发后，将分支推送到自己的Fork仓库。\n', 'checkout<branch_name>\n这将使当前工作目录切换到名为<branch_name>的分支上。\n合并分支\n要将一个分支的更改合并到当前分支，可以使用以下命令：\nbash\ngitmerge<branch_name>\n这将把名为<branch_name>的分支合并到当前分支上。\n解决冲突\n在合并分支时，如果发生冲突，需要手动解决冲突。可以通过编辑文件来解决冲突，然后使用以下命令标记文件为已解决冲突的状态：bash\ngitadd<file_name>\n解决完所有冲突后，可以继续合并分支。\n以上是关于Git分支管理的基本操作。\n远程仓库\n添加远程仓库\n要将本地仓库与远程仓库关联，可以使用以下命令：\nbash\ngitremoteaddorigin远程仓库地址\n其中，origin是远程仓库的别名，可以

In [13]:
# 从一阶段查询结果中用Reranker再次筛选出最相似的2个文档
rerank_content = reranker.rerank(question, content, k=2)
print(rerank_content)

['是用来隔离开发工作的。每个分支都是一个独立的开发环境，互不影响。分支可以很方便地被创建和合并，因此许多开发者使用分支来进行特性开发、修复bug或者尝试新想法。\nGit的一个核心概念是几乎所有操作都是本地执行的，分支也不例外。这意味着你在本地创建或切换分支，不需要与远程仓库进行通信。\n创建与合并分支\n在Git中创建新分支可以使用gitbranch命令，合并分支则使用gitmerge命令。```bash\n创建新分支\ngitbranch\n切换到新分支\ngitcheckout\n创建新分支并立即切换到该分支\ngitcheckout-b\n合并指定分支到当前分支\ngitmerge\n```\n分支策略\n合理的分支策略可以帮助团队更有效地协作。一种常见的策略是GitFlow，它定义了一个围绕项目发布的分支模型，包括功能分支、发布分支、维护分支等。\n另一种策略是GitHubFlow，它更加简单灵活，适合持续交付的项目。在GitHubFlow中，master分支通常是稳定的，并且随时可以部署。所有新的开发都在基于master的特性分支上进行，一旦完成就可以合并回master。\n解决冲突\n', 'checkout<branch_name>\n这将使当前工作目录切换到名为<branch_name>的分支上。\n合并分支\n要将一个分支的更改合并到当前分支，可以使用以下命令：\nbash\ngitmerge<branch_name>\n这将把名为<branch_name>的分支合并到当前分支上。\n解决冲突\n在合并分支时，如果发生冲突，需要手动解决冲突。可以通过编辑文件来解决冲突，然后使用以下命令标记文件为已解决冲突的状态：bash\ngitadd<file_name>\n解决完所有冲突后，可以继续合并分支。\n以上是关于Git分支管理的基本操作。\n远程仓库\n添加远程仓库\n要将本地仓库与远程仓库关联，可以使用以下命令：\nbash\ngitremoteaddorigin远程仓库地址\n其中，origin是远程仓库的别名，可以根据实际情况自行命名。\n推送到远程仓库\n将本地提交推送到远程仓库可以使用以下命令：\nbash\ngitpushorigin分支名\n例如，将本地的master分支推送到远程仓库可以使用：\nbash\ngitpushorig

In [14]:
# 最后选择最相似的文档, 交给LLM作为可参考上下文
best_content = rerank_content[0]
print(chat.chat(question, [], best_content))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


要使用Git新建分支，你可以使用`git branch`命令。首先，输入以下命令创建一个新的分支：

```bash
git branch new-branch-name
```

其中`new-branch-name`是你想要为分支命名的名称。然后，输入以下命令将新分支切换到你刚刚创建的分支上：

```bash
git checkout new-branch-name
```

现在，你可以开始在你的新分支上进行开发工作。如果需要将新分支合并回主分支，可以使用`git merge`命令。例如，如果你想要将新分支合并回主分支，你可以输入以下命令：

```bash
git merge master
```

其中`master`是你主分支的名称。

请注意，在合并分支时，可能会发生冲突。这时，你需要手动解决冲突，确保所有更改都被正确地合并。
