### current directory: /home/lzc/mindspore/ChatBabel.ipynb

In [1]:
# Local checkpoint directories: `model_path` is the embedding model used by the
# retriever section, `llm_path` is the chat model loaded in the LLM section.
# NOTE(review): both assignments are repeated verbatim in later cells.
model_path = '/data1/model/bge1_5-large-zh'
llm_path = '/data1/model/qwen1_5-7b-chat'

## Preparing papers
1. Locate the zip file that contains the papers and unzip it into the `./data` directory.
2. Manually create a .bib file that contains the metadata for all the papers and store it in `./bib_data`.

In [2]:
# %%capture captured_output
# !unzip papers_condensed.zip -d ./data

# 1. Preparation for PDF loader

In [3]:
### Preprocess pdf documents
import pdfplumber
import pdftotext
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema.document import Document
import importlib
import utils
importlib.reload(utils)
from utils import *
import os
import glob

# modified from https://stackoverflow.com/questions/77045559/langchain-load-with-string
def get_text_chunks_langchain(text, title, author, chunk_size=1000, chunk_overlap=50):
    """Split a raw string into LangChain ``Document`` chunks tagged with paper metadata.

    The result has the same form as ``docs = loader.load()``: a list of
    ``Document`` objects.

    Args:
        text: Raw text to split (for example, the concatenated pages of one PDF).
        title: Value stored under the ``"title"`` metadata key of every chunk.
        author: Value stored under the ``"author"`` metadata key of every chunk.
        chunk_size: Forwarded to ``CharacterTextSplitter``; measured with ``len``.
        chunk_overlap: Forwarded to ``CharacterTextSplitter``.

    Returns:
        A list with one ``Document`` per chunk. Every chunk gets its own
        metadata dict, so mutating one chunk's metadata does not affect others.
    """
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )
    return [
        Document(page_content=chunk, metadata={"title": title, "author": author})
        for chunk in text_splitter.split_text(text)
    ]

def load_pdf(filepath, bib_file):
    """Read one PDF and return its text as metadata-tagged document chunks.

    The (title, author) pair is the first entry returned by
    ``get_title_author_from_pdf(filepath, bib_file)``. It is printed and then
    attached to every chunk produced by ``get_text_chunks_langchain``.
    """
    title_author = get_title_author_from_pdf(filepath, bib_file)[0]
    print(title_author)
    title, author = title_author
    with open(filepath, 'rb') as pdf_handle:
        # The PDF object is iterated page by page; pages are concatenated
        # without any separator.
        full_text = "".join(pdftotext.PDF(pdf_handle))
    return get_text_chunks_langchain(full_text, title, author)


""" Unit test """
# pdf_files = ["./GA_papers/CMA_ES.pdf", "./GA_papers/SBX.pdf", './GA_papers/HypE.pdf']
# Collect every PDF under ./GA_papers/; glob returns paths in arbitrary order.
directory = './GA_papers/'
pattern = '*.pdf'
pdf_files = glob.glob(os.path.join(directory, pattern))

# Flat list of chunk Documents from all papers; later passed to vector_store.add_documents.
documents = []
# BibTeX file forwarded to load_pdf, which passes it to get_title_author_from_pdf.
bib_file = "./bib_data/paper_metadata_full.bib"


for pdf_file in pdf_files:
    # print(pdf_file)
    docs = load_pdf(pdf_file, bib_file)
    documents += docs
    
# Smoke check: show the metadata of the last chunk (raises IndexError if no PDF was found).
print(documents[-1].metadata)

('A Survey on Evolutionary Computation for Computer Vision and Image Analysis: Past, Present, and Future Trends', 'Bi, Ying')
('Modified Distance Calculation in Generational Distance and Inverted Generational Distance', 'Ishibuchi, Hisao')
('Unknown title', 'Unknown author')
('The Pareto archived evolution strategy: a new baseline algorithm for Pareto multiobjective optimisation', 'Knowles, J.')
('Unknown title', 'Unknown author')
('Unknown title', 'Unknown author')
('A Scalable Multi-objective Test Problem Toolkit', 'Huband, Simon')
('Unknown title', 'Unknown author')
('Performance of Decomposition-Based Many-Objective Algorithms Strongly Depends on Pareto Front Shapes', 'Ishibuchi, Hisao')
('The CMA Evolution Strategy: A Tutorial', 'Hansen, Nikolaus')
('MOEA/D: A Multiobjective Evolutionary Algorithm Based on Decomposition', '{Qingfu Zhang}')
('Unknown title', 'Unknown author')
('Comparison of Multiobjective Evolutionary Algorithms: Empirical Results', 'Zitzler, Eckart')
('A parallel

# 2. Preparation for Retriever

In [4]:
# !pip install transformers torch ipywidgets

In [5]:
### Load model from local files
from transformers import AutoModel, AutoTokenizer

# Local checkpoint directories (same values as in the first cell).
model_path = '/data1/model/bge1_5-large-zh'
llm_path = '/data1/model/qwen1_5-7b-chat'

# NOTE(review): none of the visible cells read these two objects before `model`
# and `tokenizer` are rebound to the chat model in section 3; the embeddings in
# the next code cell are built from `model_path` directly. Confirm this load is
# still needed.
model = AutoModel.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [6]:
# !pip install sentence_transformers chromadb

In [7]:
# !pip install -U langchain

In [8]:
### Embed documents into vectordb
from langchain.vectorstores import Chroma, FAISS
from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceBgeEmbeddings
from sentence_transformers import SentenceTransformer

model_path = '/data1/model/bge1_5-large-zh'
# embeddings = SentenceTransformer(model_name_or_path=model_path, local_files_only=True)

# current directory: /home/lzc/mindspore/ChatBabel.ipynb
# embeddings = HuggingFaceBgeEmbeddings(model_name='BAAI/bge-large-zh-v1.5', cache_folder=model_path)

# Build the embedding function from the local checkpoint directory.
embeddings = HuggingFaceBgeEmbeddings(model_name=model_path)#, cache_folder=model_path)

# Index every PDF chunk collected in `documents`; no persist directory is passed.
# NOTE(review): re-running this cell calls add_documents again - check whether
# that duplicates entries in the underlying collection.
vector_store = Chroma(embedding_function=embeddings)
vector_store.add_documents(documents)
# Retriever with default search settings; answer() reads it as a global.
retriever = vector_store.as_retriever()

""" Unit test """
# Retrieval smoke test: print the metadata and first 250 characters of each hit.
query = "A crossover operator in the continuous space."
retrieved_docs = retriever.get_relevant_documents(query)
for doc in retrieved_docs:
    print(doc.metadata)
    print(doc.page_content[:250])

{'author': 'Deb, K.', 'title': 'Scalable multi-objective optimization test problems'}
XM) and the objective function values lie on the linear hyper-
plane: E:=, fk = 0.5. A value of k = 5 is suggested
here. In the above problem, the total number of variables
is n = M + k - 1. The difficulty in this problem is to converge
to the hyper-
{'author': 'Deb, Kalyanmoy', 'title': 'Simulated Binary Crossover for Continuous Search Space'}
operators need modified to solve t he above problems , similar reproduct ion
techniques can be used along with the SBX op erator used in t his study t o
investigat e t he efficacy of real-cod ed GA s in mult imodal and mu lt iob jective
problems defi
{'author': 'Deb, Kalyanmoy', 'title': 'Simulated Binary Crossover for Continuous Search Space'}
flexible t he ope rator is in creating an arb itrary point in t he sear ch space. A
number of crit eria for the successful design of a crossover operator ar e suggested in [7] . T hat study shows how different existing c

  warn_deprecated(


In [9]:
""" Unit test """
# NOTE(review): this repeats the retrieval smoke test at the end of the previous
# code cell (same query, same printing loop).
query = "A crossover operator in the continuous space."
retrieved_docs = retriever.get_relevant_documents(query)
for doc in retrieved_docs:
    # Show which paper the chunk came from, then a 250-character preview.
    print(doc.metadata)
    print(doc.page_content[:250])

{'author': 'Deb, K.', 'title': 'Scalable multi-objective optimization test problems'}
XM) and the objective function values lie on the linear hyper-
plane: E:=, fk = 0.5. A value of k = 5 is suggested
here. In the above problem, the total number of variables
is n = M + k - 1. The difficulty in this problem is to converge
to the hyper-
{'author': 'Deb, Kalyanmoy', 'title': 'Simulated Binary Crossover for Continuous Search Space'}
operators need modified to solve t he above problems , similar reproduct ion
techniques can be used along with the SBX op erator used in t his study t o
investigat e t he efficacy of real-cod ed GA s in mult imodal and mu lt iob jective
problems defi
{'author': 'Deb, Kalyanmoy', 'title': 'Simulated Binary Crossover for Continuous Search Space'}
flexible t he ope rator is in creating an arb itrary point in t he sear ch space. A
number of crit eria for the successful design of a crossover operator ar e suggested in [7] . T hat study shows how different existing c

# 3. Preparation for LLM module

In [10]:
from mindnlp.transformers import AutoModelForCausalLM, AutoTokenizer

llm_path = '/data1/model/qwen1_5-7b-chat'
# Rebinds the global `model` (previously the embedding AutoModel) to the chat
# LLM; answer() passes this global to stream_generate_answer.
model = AutoModelForCausalLM.from_pretrained(llm_path)
# Turn training mode off, since the model is only used for generation here.
model.set_train(False)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

MindSpore do not support bfloat16 dtype, we will automaticlly convert to float16


Qwen2ForCausalLM<
  (model): Qwen2Model<
    (embed_tokens): Embedding<vocab_size=151936, embedding_size=4096, use_one_hot=False, weight=Parameter (Tensor(shape=[151936, 4096], dtype=Float16, value=[...], name=model.embed_tokens.weight), requires_grad=True), dtype=Float32, padding_idx=None>
    (layers): CellList<
      (0): Qwen2DecoderLayer<
        (self_attn): Qwen2Attention<
          (q_proj): Dense<input_channels=4096, output_channels=4096, has_bias=True>
          (k_proj): Dense<input_channels=4096, output_channels=4096, has_bias=True>
          (v_proj): Dense<input_channels=4096, output_channels=4096, has_bias=True>
          (o_proj): Dense<input_channels=4096, output_channels=4096>
          (rotary_emb): Qwen2RotaryEmbedding<>
          >
        (mlp): Qwen2MLP<
          (gate_proj): Dense<input_channels=4096, output_channels=11008>
          (up_proj): Dense<input_channels=4096, output_channels=11008>
          (down_proj): Dense<input_channels=11008, output_channels=4

In [11]:
# Rebinds the global `tokenizer` to the chat model's tokenizer. When cells run in
# order, `AutoTokenizer` here is the mindnlp class imported in the previous cell.
tokenizer = AutoTokenizer.from_pretrained(llm_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# 4. Asking ChatBabel questions about research.

In [12]:
from mindspore import Tensor
from mindspore import context
from mindnlp.transformers import TextIteratorStreamer
from threading import Thread
import json

def stream_generate_answer(
    input_ids,
    tokenizer,
    model,
    max_new_tokens=300,
    temperature=0.8,
    repetition_penalty=1.0,
    context_len=2048
):
    """Generate a sampled completion in a background thread and yield text pieces.

    Args:
        input_ids: Prompt token ids, either a flat sequence or a batched
            ``(batch, seq_len)`` array/tensor (the form ``answer()`` passes in).
        tokenizer: Tokenizer handed to ``TextIteratorStreamer`` for decoding.
        model: Object whose ``generate`` method accepts the keyword arguments
            assembled below.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature (sampling is always enabled).
        repetition_penalty: Forwarded unchanged to ``model.generate``.
        context_len: Total token budget shared by the prompt and the completion.

    Yields:
        Decoded text pieces as the streamer produces them (prompt and special
        tokens are skipped).

    Raises:
        ValueError: If ``context_len`` leaves no room for any prompt token.
    """
    # Reserve room for the completion plus a small safety margin of 8 tokens.
    max_src_len = context_len - max_new_tokens - 8
    if max_src_len <= 0:
        raise ValueError(
            "context_len={} leaves no room for the prompt with max_new_tokens={}".format(
                context_len, max_new_tokens
            )
        )

    streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)

    # Truncate along the LAST axis so that the most recent tokens are kept for
    # both flat and batched inputs. Slicing the first axis of a batched
    # (1, seq_len) tensor would select batch rows and leave the prompt untouched.
    input_ids = Tensor(input_ids)
    input_ids = input_ids[..., -max_src_len:]

    generation_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        do_sample=True,
        repetition_penalty=repetition_penalty,
        streamer=streamer,
    )
    # generate() blocks until completion, so it runs in a worker thread while
    # this generator drains the streamer.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    yield from streamer

    # The streamer is exhausted, so generation has finished; wait for the worker
    # to exit before returning control to the caller.
    thread.join()

def answer(prompt):
    """Answer a research-idea prompt using retrieved paper chunks as context.

    Reads the module-level ``retriever``, ``tokenizer`` and ``model``.

    Args:
        prompt: The user's message; also used as the retrieval query.

    Returns:
        A ``(response, references)`` tuple: the stripped generated text, and a
        reference block listing the distinct metadata of the retrieved chunks
        whose author and title are both known.
    """
    retrieved_docs = retriever.get_relevant_documents(prompt)

    # Each retrieved chunk contributes its JSON metadata line followed by the
    # first 250 characters of its text.
    context_str = "".join(
        json.dumps(doc.metadata) + '\n' + doc.page_content[:250] + '\n'
        for doc in retrieved_docs
    )

    # Every template line after the first begins with a tab; the tabs are
    # spelled as escapes so the exact whitespace sent to the model is visible.
    system_prompt = (
        "基于以下已知信息，简洁和专业的告知用户他们的研究想法是否出现在已知信息的文献中。\n"
        "\t请提供相关条目的标题以及作者，不允许在答案中添加编造成分，答案请使用中文。\n"
        "\n"
        "\t已知信息：\n"
        "\t{context}\n"
        "\n"
        "\t请仔细思考并回答。\n"
        "\t"
    ).format(context=context_str)

    input_ids = tokenizer.apply_chat_template(
        conversation=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        tokenize=True,
        add_generation_prompt=True,
        return_tensors='ms'
    )

    # Drain the stream completely; the pieces are only returned, never printed.
    response = "".join(stream_generate_answer(input_ids, tokenizer, model)).strip()

    # Keep the first occurrence of each metadata dict, skipping placeholders.
    unique_metadata = []
    for retrieved_doc in retrieved_docs:
        metadata = retrieved_doc.metadata
        if metadata['author'] in ('None', 'Unknown author'):
            continue
        if metadata['title'] in ('None', 'Unknown title'):
            continue
        if metadata not in unique_metadata:
            unique_metadata.append(metadata)

    references = "\n参考资料：\n" + "".join(
        json.dumps(item) + '\n' for item in unique_metadata
    )

    return response, references

# prompt = "I have a new idea! For a LLM, it's almost impossible to pre-train from scratch: too costly. A solution is to freeze all the parameters, and create a new low-rank estimation of the original weights and train those weights with reduced parameters. What do you think?"
prompt = """
I have a new idea! for evolutionary algorithms, we usually perform the crossover operation on discrete strings. 
but we could study the probability distribution of the variation operator and mathematically model them in a continuous space to
perform a real-valued crossover operation. what do you think of this idea?
"""

答案是正确的，指出了Deb教授的Simulated Binary Crossover文献。

# Run retrieval + generation for the crossover-idea prompt, then show the model's
# answer followed by the deduplicated reference list.
response, references = answer(prompt)
print(response)
print(references)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

您的研究想法在已知信息的文献中有所体现。Deb教授的"Simulated Binary Crossover for Continuous Search Space"探讨了在连续搜索空间中使用实值编码的遗传算法，这种方法涉及到模拟二进制交叉操作。您提到的将变异操作的概率分布建模到连续空间并进行实值交叉操作，与该研究中讨论的处理连续变量和设计连续编码方式的概念相吻合。然而，具体的数学模型和效果需要通过实验来验证，而您的创新点在于如何将传统二叉交叉策略扩展到实值空间。如果这是对现有工作的一个改进，可能会为优化问题提供新的解决方案。

参考资料：
{"author": "Deb, Kalyanmoy", "title": "Simulated Binary Crossover for Continuous Search Space"}



In [13]:
prompt = """I have a new idea! For a LLM, it's almost impossible to pre-train from scratch: too costly. 
A solution is to freeze all the parameters, and create a new low-rank estimation of the original weights and train those weights with reduced parameters. 
What do you think?"""

答案是正确的，因为数据库里没有这个文献。

# Same pipeline for the low-rank fine-tuning prompt defined above in this cell.
response, references = answer(prompt)
print(response)
print(references)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

抱歉，您的研究想法不在已知信息的文献中。文献中提到的内容不涉及预训练模型的参数冻结和低rank估计，而是讨论了多目标优化算法的性能依赖于Pareto前沿形状、遗传局部搜索算法在特定问题上的难度、 niched-penalty 方法在遗传算法中的应用，以及如何处理约束。您的想法与这些主题无关。

参考资料：
{"author": "Ishibuchi, Hisao", "title": "Performance of Decomposition-Based Many-Objective Algorithms Strongly Depends on Pareto Front Shapes"}
{"author": "Ishibuchi, H.", "title": "A multi-objective genetic local search algorithm and its application to flowshop scheduling"}
{"author": "Deb, Kalyanmoy", "title": "A Niched-Penalty Approach for Constraint Handling in Genetic Algorithms"}



In [14]:
prompt = "I would like to create a new genetic algorithm."

可以看到，这个prompt的问题太笼统了。很难作出很好的回答，我们来看看qwen是怎么回答的。

qwen试着去优化用户的研究想法，因为数据库里拥有很多可能跟这个相关的文献，但又很难选出一条具体的文献。甚至可以看到qwen有hallucination的倾向。

# Same pipeline for the deliberately vague prompt defined above in this cell.
response, references = answer(prompt)
print(response)
print(references)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

您的研究想法可能与以下条目相关：

1. "Compaction of symbolic layout using genetic algorithms" by M. P. Fourman (198Swap。这篇论文讨论了使用遗传算法进行符号布局的压缩，虽然它可能不直接涉及新的遗传算法设计，但遗传算法的概念和应用提供了基础。

2. 如果您想比较或评估不同的遗传算法，Zitzler和Eckart的"Comparison of Multiobjective Evolutionary Algorithms: Empirical Results"可能会提供有价值的信息。他们的研究可能包含了对新算法性能的分析。

3. 如果您的新算法与权重选择或组合优化有关，B.Manderick和P.Spiessens的论文"如何选择权重"可能对设计中的选择策略有所启发，尽管他们没有直接提到新算法，但可以参考他们的方法论。

4. 如果您的研究得到了政府资助，如K. Deb在印度的研究，这可能意味着您的工作符合某些研究资助机构对研究方向的期待，但具体是否相关需要查看他们的资助项目细节。

请注意，由于提供的信息不包含具体 FulfillmentDate，所以这些条目可能不完全对应您的最新研究，建议您查阅具体文献以获取最准确的比较。同时，您可能需要创建一个新的研究提案或论文来详细阐述您的新算法。

参考资料：
{"author": "Zitzler, Eckart", "title": "Comparison of Multiobjective Evolutionary Algorithms: Empirical Results"}

