# 1. 设置环境

In [9]:
import os

os.environ['http_proxy'] = '127.0.0.1:7890'
os.environ['https_proxy'] = '127.0.0.1:7890'

os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = "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"

os.environ["DASHSCOPE_API_KEY"] = "sk-62396f4eadf94b50acc7161cfd4b0b0e"

# 2. 导入数据
## 2.1 导入数据方法1：通过TextLoader导入数据

In [2]:
from langchain.document_loaders import TextLoader
home_path = os.getcwd()
data_path = os.path.join(home_path, 'data')
documents = []
for file_name in os.listdir(data_path):
    if file_name.endswith(".txt"):
        file_path = os.path.join(data_path, file_name)
        documents.append(file_path)

for i in range(len(documents)):
    docs = TextLoader(documents[i]).load()

## 2.2 导入数据方法2：通过DirectoryLoader导入数据

In [10]:
from langchain.document_loaders import TextLoader, DirectoryLoader

home_path = os.getcwd()
data_path = os.path.join(home_path, 'data')
text_loader_kwargs ={'autodetect_encoding': True}
loader = DirectoryLoader(data_path, glob="**/*.txt", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs, show_progress=True)
docs = loader.load()

100%|██████████| 8/8 [00:00<00:00, 1674.37it/s]


# 3. 文本分割

In [11]:
from langchain_community.embeddings import DashScopeEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma


text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = text_splitter.split_documents(docs)

print(len(documents))
print("="*200)
for doc in documents:
    print(len(doc.page_content))

db = Chroma.from_documents(documents, DashScopeEmbeddings())

1134
440
998
543
14
996
239
876
489
993
492
999
323
801
982
612
995
433
842
995
558
987
757
873
991
709
760
951
424
918
992
994
619
495
894
977
961
890
727
979
466
831
632
896
611
970
71
932
995
383
646
557
916
793
997
790
774
670
696
804
974
506
972
997
654
723
951
999
904
686
728
742
850
862
887
967
790
776
758
837
518
194
998
896
979
976
981
847
993
591
995
374
786
933
624
999
884
999
550
632
929
997
837
207
999
495
963
402
927
891
574
530
622
996
812
696
480
994
214
777
998
474
870
313
827
851
874
997
289
717
969
45
999
438
288
995
518
553
972
857
436
862
896
953
994
816
759
991
395
919
918
727
993
230
794
996
202
691
936
595
947
936
920
990
279
50
997
317
997
314
992
216
821
994
792
977
912
949
910
980
930
935
937
978
183
961
996
995
367
995
986
588
999
875
995
995
501
984
998
604
780
958
925
634
924
520
735
692
997
663
990
872
996
319
638
607
930
823
902
738
456
987
459
915
901
965
832
999
386
972
649
990
481
496
606
765
962
972
656
999
220
638
851
978
996
557
833
806
993
999
325

# 4.运行

## 4.1运行方法1

In [5]:
retriever = db.as_retriever()
retrieved_docs = retriever.invoke("What is prompt?")
print(retrieved_docs)

[Document(metadata={'source': 'd:\\待办\\D40视频\\LLM\\D03Base\\data\\Zhao 等 - Correcting Language Model Bias for Text Classification in True Zero-Shot Learning.txt'}, page_content='The performance of prompt learning relies on whether the PLMs can fill in the correct label word'), Document(metadata={'source': 'd:\\待办\\D40视频\\LLM\\D03Base\\data\\Zhu 等 - 2024 - Large Language Models for Information Retrieval A Survey.txt'}, page_content='Few-shot prompt\nExample 1:\nDocument: …If you are pregnant, limit caffeine to 200 milligrams each day. This is about the amount in 1½ 8-ou nce cups of coffee or one 12-ounce cup of coffee. Relevant Query: Is a little caffeine ok during pregnancy?\nExample N:\nDocument: Passiflora herbertiana. A rare passion fruit native to Australia...\nRelevant Query: What fruit is native to Australia?\nExample N +1:\nDocument: {#Document}\nRelevant Query:\nZero-shot prompt\nWrite a Question answered by the given passage.\nPassage: {#Passage}\nQuery:\nBrainstorm prompt\nBr

In [6]:
for docs in retrieved_docs:
    print("="*200)
    print(docs)

page_content='The performance of prompt learning relies on whether the PLMs can fill in the correct label word' metadata={'source': 'd:\\待办\\D40视频\\LLM\\D03Base\\data\\Zhao 等 - Correcting Language Model Bias for Text Classification in True Zero-Shot Learning.txt'}
page_content='Few-shot prompt
Example 1:
Document: …If you are pregnant, limit caffeine to 200 milligrams each day. This is about the amount in 1½ 8-ou nce cups of coffee or one 12-ounce cup of coffee. Relevant Query: Is a little caffeine ok during pregnancy?
Example N:
Document: Passiflora herbertiana. A rare passion fruit native to Australia...
Relevant Query: What fruit is native to Australia?
Example N +1:
Document: {#Document}
Relevant Query:
Zero-shot prompt
Write a Question answered by the given passage.
Passage: {#Passage}
Query:
Brainstorm prompt
Brainstorm a list of potentially useful text retrieval tasks. Please adhere to the following guidelines: - Specify what the query is, and what the desired documents are.
-	E

## 4.2运行方法2

In [12]:
query = "What is prompt?"
retrieved_docs = db.similarity_search(query)
print(retrieved_docs)

[Document(metadata={'source': 'd:\\待办\\D40视频\\LLM\\D03Base\\data\\Zhao 等 - Correcting Language Model Bias for Text Classification in True Zero-Shot Learning.txt'}, page_content='The performance of prompt learning relies on whether the PLMs can fill in the correct label word'), Document(metadata={'source': 'd:\\待办\\D40视频\\LLM\\D03Base\\data\\Zhao 等 - Correcting Language Model Bias for Text Classification in True Zero-Shot Learning.txt'}, page_content='The performance of prompt learning relies on whether the PLMs can fill in the correct label word'), Document(metadata={'source': 'd:\\待办\\D40视频\\LLM\\D03Base\\data\\Zhu 等 - 2024 - Large Language Models for Information Retrieval A Survey.txt'}, page_content='Few-shot prompt\nExample 1:\nDocument: …If you are pregnant, limit caffeine to 200 milligrams each day. This is about the amount in 1½ 8-ou nce cups of coffee or one 12-ounce cup of coffee. Relevant Query: Is a little caffeine ok during pregnancy?\nExample N:\nDocument: Passiflora herbe

In [13]:
for docs in retrieved_docs:
    print("="*200)
    print(docs)

page_content='The performance of prompt learning relies on whether the PLMs can fill in the correct label word' metadata={'source': 'd:\\待办\\D40视频\\LLM\\D03Base\\data\\Zhao 等 - Correcting Language Model Bias for Text Classification in True Zero-Shot Learning.txt'}
page_content='The performance of prompt learning relies on whether the PLMs can fill in the correct label word' metadata={'source': 'd:\\待办\\D40视频\\LLM\\D03Base\\data\\Zhao 等 - Correcting Language Model Bias for Text Classification in True Zero-Shot Learning.txt'}
page_content='Few-shot prompt
Example 1:
Document: …If you are pregnant, limit caffeine to 200 milligrams each day. This is about the amount in 1½ 8-ou nce cups of coffee or one 12-ounce cup of coffee. Relevant Query: Is a little caffeine ok during pregnancy?
Example N:
Document: Passiflora herbertiana. A rare passion fruit native to Australia...
Relevant Query: What fruit is native to Australia?
Example N +1:
Document: {#Document}
Relevant Query:
Zero-shot prompt
W

## 4.3 运行方法3

In [14]:
import pandas as pd
data = []
query = "What is prompt?"
docs = db.similarity_search_with_score(query, k = len(documents))
for doc in docs:
    metadata = doc[0].metadata
    data.append({"Content": doc[0].page_content, 
                 "Similarity Score": doc[1],
                 "Sourcd Path": metadata.get('source', '')})

    print("="*200)
    print("该文档相似度为:",doc[1])
    print(doc[0])

df = pd.DataFrame(data)
df.to_excel("result.xlsx", index = False)

该文档相似度为: 8397.134765625
page_content='The performance of prompt learning relies on whether the PLMs can fill in the correct label word' metadata={'source': 'd:\\待办\\D40视频\\LLM\\D03Base\\data\\Zhao 等 - Correcting Language Model Bias for Text Classification in True Zero-Shot Learning.txt'}
该文档相似度为: 8397.134765625
page_content='The performance of prompt learning relies on whether the PLMs can fill in the correct label word' metadata={'source': 'd:\\待办\\D40视频\\LLM\\D03Base\\data\\Zhao 等 - Correcting Language Model Bias for Text Classification in True Zero-Shot Learning.txt'}
该文档相似度为: 9335.20703125
page_content='Few-shot prompt
Example 1:
Document: …If you are pregnant, limit caffeine to 200 milligrams each day. This is about the amount in 1½ 8-ou nce cups of coffee or one 12-ounce cup of coffee. Relevant Query: Is a little caffeine ok during pregnancy?
Example N:
Document: Passiflora herbertiana. A rare passion fruit native to Australia...
Relevant Query: What fruit is native to Australia?

## 4.4 运行方法4

In [15]:
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.llms import Tongyi

retriever = db.as_retriever()

template = """Answer the question based only on the following context:
{context}

Question:
{question}
"""

prompt = ChatPromptTemplate.from_template(template)

model = Tongyi(model="qwen-turbo", temperature=0.7)

def format_docs(docs):
    return "\n\n".join([d.page_content for d in docs])

chain = (
    {"context":retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [16]:
chain.invoke("What is prompt?")

'Based on the provided context, "prompt" refers to a technique used in machine learning, specifically in the context of prompt learning with Pre-Trained Language Models (PLMs). A prompt is a method that involves framing a task as a prediction problem where the model is given some contextual information (the prompt) and asked to predict a specific output, such as a label or answer to a question. The effectiveness of this approach depends on the model\'s ability to correctly fill in or predict the right label or response based on the given prompt. \n\nHowever, the term "prompt" in your question could also refer to the examples provided for different types of prompts like few-shot, zero-shot, and brainstorm prompts which are used to elicit specific responses from PLMs. These prompts are designed to help the model understand and generate appropriate outputs for various tasks without extensive training or additional data. \n\nSo, in summary, a "prompt" is a form of input designed to guide a