In [5]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import TextLoader, PDFMinerLoader, UnstructuredMarkdownLoader
from langchain_text_splitters import MarkdownHeaderTextSplitter, CharacterTextSplitter
# Load the 'bge-large-zh-v1.5' embedding model; the resulting `embedding` object is
# what the next cell passes to FAISS.load_local.
embedding = HuggingFaceEmbeddings(model_name='bge-large-zh-v1.5')

  embedding = HuggingFaceEmbeddings(model_name='bge-large-zh-v1.5')
  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Open the saved index named "1742380748273" stored under ./vectorstore/1.
# NOTE(review): allow_dangerous_deserialization=True opts in to deserializing the
# stored files -- only point this at index folders you produced yourself.
db = FAISS.load_local(
    './vectorstore/1',
    embedding,
    index_name="1742380748273",
    allow_dangerous_deserialization=True,
)
# Fetch the 5 stored chunks closest to the query text.
docs = db.similarity_search('唯一的真理是什么', k=5)

In [7]:
# Show the retrieved documents as this cell's output.
docs

[Document(id='dcf06aab-8261-485a-b470-0354a8c5a3d8', metadata={'source': 'uploads\\1742380748273.txt'}, page_content='The only thing that words in this world\n这世上唯一有效的真理是\nis that you treat others\n就是以他人对待你的方式\nas they treat you\n对待他人\nThose that has treated me with kindness\n待我以礼者\nI will repay they kindness ten fold\n滴水之恩我以涌泉相报\nAnd those……\n至于那些……\nThey treat me with injustice!\n待我不公之人！\nThey used me\n利用我之人\nThey hurt me down\n伤害我之人\nThey hurt me friends\n伤我友人\nI shall repay that in justice a thousand times over\n此等不公我将以千倍奉还。\n\n\n“太阳所照之处皆为我们的国土。国王的更替就像太阳的升落。有一天，我的时代将随着太阳的落下而逝去，而它将会为你升起，当你成为新的王。”'),
 Document(id='eb635f20-351d-43d9-a531-d4516eda29d5', metadata={'source': 'uploads\\1742380748273.txt'}, page_content='#80000000\n\n机考参考：\n递归：LeetCode70、112、509\n分治：LeetCode23、169、240\n单调栈：LeetCode84、85、739、503\n并查集：LeetCode547、200、684\n滑动窗口：LeetCode209、3、1004、1208\n前缀和：LeetCode724、560、437、1248\n差分：LeetCode1094、121、122\n拓扑排序：LeetCode210\n字符串：LeetCode5、20、43、93\n二分查找：LeetCode33、34\nBFS：Lee

In [None]:
from openai import OpenAI
# Client pointed at an OpenAI-compatible endpoint on 127.0.0.1:9997; the api_key
# argument is a placeholder string.
client = OpenAI(base_url="http://127.0.0.1:9997/v1", api_key="not used actually")

# Send one chat-completion request (system + user message) to "qwen2.5-instruct".
response = client.chat.completions.create(
    model="qwen2.5-instruct",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the largest animal?"}
    ]
)
# Print the whole response object rather than just the message text.
print(response)

In [1]:
import requests

def test_connection(base_url):
    try:
        response = requests.get(base_url)
        response.raise_for_status()
        print("Connection successful")
    except requests.exceptions.RequestException as e:
        print(f"Connection error: {e}")

# Probe the local endpoint on port 9997 when this cell runs as the main module.
if __name__ == "__main__":
    base_url = 'http://127.0.0.1:9997/v1'
    test_connection(base_url)

Connection error: HTTPConnectionPool(host='127.0.0.1', port=9997): Max retries exceeded with url: /v1 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001EB52D58CA0>: Failed to establish a new connection: [WinError 10061] 由于目标计算机积极拒绝，无法连接。'))


In [None]:
from flask import Flask, request, jsonify
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.graphs import Neo4jGraph
from neo4j import GraphDatabase
import os

class RAG:
    """Flask question-answering service backed by an LLM and per-course FAISS indexes.

    A single ``/ask`` POST route accepts ``{"course": ..., "question": ...}``.
    When ``./vectorstore/<course>`` contains ``.faiss`` indexes they are merged
    and used as retrieval context for the prompt; otherwise the question is
    sent to the LLM without context.

    NOTE(review): ``course``, ``retriever`` and ``chain`` are shared mutable
    state on the instance; concurrent requests for different courses can
    interleave. Confirm the deployment is single-threaded or add locking.
    """

    def __init__(self, llm_params, embedding_model, uri, user, password):
        """Create the Flask app, LLM client, embedding model and Neo4j driver.

        Args:
            llm_params: Keyword arguments forwarded to ``ChatOpenAI``.
            embedding_model: Model name passed to ``HuggingFaceEmbeddings``.
            uri: Neo4j connection URI.
            user: Neo4j user name.
            password: Neo4j password.
        """
        self.app = Flask(__name__)
        self.llm = ChatOpenAI(**llm_params)
        # Course whose index is currently loaded; None until the first request.
        self.course = None
        self.retriever = None
        self.chain = None
        self.embedding = HuggingFaceEmbeddings(model_name=embedding_model)
        self.driver = GraphDatabase.driver(uri, auth=(user, password))
        self._setup_routes()

    @staticmethod
    def _is_safe_course(course):
        """Return True if ``course`` is usable as a single directory name.

        The course value comes from the request body and is interpolated into
        a filesystem path whose contents are later deserialized, so path
        separators and the ``.``/``..`` entries are rejected.
        """
        name = str(course)
        if not name or name in (".", ".."):
            return False
        return not any(ch in name for ch in ("/", "\\", "\x00"))

    def _load_vectorstore(self, base_path):
        """Load and merge every ``*.faiss`` index found directly in ``base_path``.

        Returns:
            The merged FAISS store, or None when the directory holds no index.
        """
        db = None
        # Sorted so the merge order does not depend on directory listing order.
        for file in sorted(os.listdir(base_path)):
            # splitext keeps dots inside the index name ("a.b.faiss" -> "a.b").
            index_name, ext = os.path.splitext(file)
            if ext != ".faiss":
                continue
            # NOTE(security): allow_dangerous_deserialization=True opts in to
            # deserializing the stored files; only trusted index folders must
            # ever be placed under ./vectorstore.
            loaded = FAISS.load_local(
                base_path, self.embedding, index_name, allow_dangerous_deserialization=True
            )
            if db is None:
                db = loaded
            else:
                db.merge_from(loaded)
        return db

    def _initial_rag(self):
        """(Re)build ``self.retriever`` and ``self.chain`` for ``self.course``.

        Uses the context-aware prompt when an index could be loaded for the
        course and the plain question-only prompt otherwise (missing folder,
        or a folder without any ``.faiss`` file).
        """
        base_path = f"./vectorstore/{self.course}"
        db = self._load_vectorstore(base_path) if os.path.isdir(base_path) else None
        if db is not None:
            self.retriever = db.as_retriever(
                search_type="similarity_score_threshold",
                search_kwargs={
                    "k": 4,
                    "score_threshold": 0.2
                })
            template = """
            【指令】根据已知信息，简洁和专业的来回答问题。如果无法从中得到答案，请说 “根据已知信息无法回答该问题”，不允许在答案中添加编造成分，答案请使用中文，确保回答准确、完整。
            【已知信息】
            {context}

            【问题】
            {question}
            """
        else:
            self.retriever = None
            template = """
            【指令】答案请使用中文，确保回答准确、完整。
            【问题】
            {question}
            """
        prompt = ChatPromptTemplate.from_template(template)
        self.chain = prompt | self.llm | StrOutputParser()

    def _combine_documents(self, docs):
        """Render retrieved documents as one text block for the prompt.

        Each document becomes a "Document N:" section listing its content and
        its metadata key/value pairs; sections are separated by a blank line.
        """
        combined_docs = []
        for i, doc in enumerate(docs):
            combined_content = f"Document {i+1}:\n"
            combined_content += f"Content:\n{doc.page_content}\n"
            combined_content += "Metadata:\n"
            for key, value in doc.metadata.items():
                combined_content += f"{key}: {value}\n"
            combined_docs.append(combined_content.strip())
        return "\n\n".join(combined_docs)

    def _query_neo4j(self, question):
        """Return the nodes whose ``name`` property contains ``question``.

        The question is passed as a query parameter, not concatenated into
        the Cypher text.
        """
        with self.driver.session() as session:
            result = session.run("MATCH (n) WHERE n.name CONTAINS $question RETURN n", question=question)
            nodes = result.data()
            return nodes

    def _setup_routes(self):
        """Register the ``/ask`` endpoint on the Flask app."""

        @self.app.route("/ask", methods=["POST"])
        def ask():
            # silent=True yields None instead of raising on a missing/invalid
            # JSON body, so malformed requests get a JSON 400 like other errors.
            data = request.get_json(silent=True)
            if not isinstance(data, dict):
                return jsonify({"error": "Request body must be a JSON object"}), 400
            course = data.get("course")
            question = data.get("question")
            if not question:
                return jsonify({"error": "Question is required"}), 400
            # course is optional; when given it must be a plain directory name.
            if course is not None and not self._is_safe_course(course):
                return jsonify({"error": "Invalid course"}), 400
            try:
                if not self.course or self.course != course:
                    self.course = course
                    try:
                        self._initial_rag()
                    except Exception:
                        # Forget the half-initialised course so the next
                        # request retries instead of reusing a missing chain.
                        self.course = None
                        self.retriever = None
                        self.chain = None
                        raise
                context = ""
                if self.retriever:
                    context = self._combine_documents(self.retriever.invoke(question))
                # NOTE(review): the graph lookup result is not used in the
                # prompt or the response yet; kept so a Neo4j failure still
                # surfaces as before. Wire it into the context or remove it.
                neo4j_context = self._query_neo4j(question)
                result = self.chain.invoke({
                    "question": question,
                    "context": context
                })
                return jsonify({
                    "answer": result,
                    "context": context
                })
            except Exception as e:
                print(f"处理请求时出错: {str(e)}")
                return jsonify({"error": str(e)}), 500

    def run(self, host="0.0.0.0", port=5000):
        """Start the Flask development server on ``host``:``port``."""
        self.app.run(host=host, port=port)

if __name__ == "__main__":
    # Secrets are read from the environment instead of being stored in the
    # notebook. A key or password that has ever been saved in a shared
    # notebook should be treated as leaked and rotated.
    # Required: DEEPSEEK_API_KEY, NEO4J_PASSWORD (KeyError if unset).
    # Optional: NEO4J_URI, NEO4J_USERNAME (fall back to the local defaults).
    llm_params = {
        "model": 'deepseek-chat',
        "openai_api_key": os.environ["DEEPSEEK_API_KEY"],
        "openai_api_base": 'https://api.deepseek.com',
        "max_tokens": 1024
    }
    embedding_model = 'bge-large-zh-v1.5'
    uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
    username = os.environ.get("NEO4J_USERNAME", "neo4j")
    password = os.environ["NEO4J_PASSWORD"]

    service = RAG(llm_params, embedding_model, uri, username, password)
    service.run()

In [None]:
from langchain.prompts import (
    PromptTemplate,
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from typing import List
import json
from langchain.llms import Tongyi

# Tongyi LLM client built with its default arguments; called below with the formatted prompt.
llm = Tongyi()

# Extract entities from text: the model class that holds the extracted entity names.
class Entities(BaseModel):
    """Identifying information about entities."""
    # Flat list of entity names; the field is required (no default).
    names: List[str] = Field(
        ...,
        description="All the person, organization, or business entities that "
        "appear in the text",
    )

    @classmethod
    def from_json(cls, json_data: dict):
        """Build an instance from a payload of the form {"entities": [{"entity": name}, ...]}.

        A payload without an "entities" key produces an empty ``names`` list.
        Every listed item must carry an "entity" key; a missing one raises KeyError.
        """
        entries = json_data['entities'] if 'entities' in json_data else []
        return cls(names=[entry['entity'] for entry in entries])

# Sample question whose entities should be extracted.
entity_query = "Where did Marie Curie work?"
# Set up a parser + inject instructions into the prompt template.
parser = PydanticOutputParser(pydantic_object=Entities)

# Update the ChatPromptTemplate to include the new prompt wording.
chat_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are extracting organization and person entities from the text.",
        ),
        (
            "human",
            "Use the given format to extract information from the following "
            "input: {question}. Please ensure the output is in valid JSON format.",
        ),
    ]
)

# Create the prompt template.
# NOTE(review): prompt_template (and the parser's format instructions it carries)
# is not referenced again in this cell; only chat_prompt is formatted and sent to
# the LLM. Confirm whether the format instructions were meant to reach the model.
prompt_template = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# Format the input.
_input = chat_prompt.format_prompt(question=entity_query)

# Get the LLM output.
output = llm(_input.to_string())

# Parse the output.
# json.loads raises json.JSONDecodeError if the model's reply is not pure JSON.
parsed_output = json.loads(output)

# Parse the output with the custom from_json method.
# from_json reads an "entities" list of {"entity": ...} items -- presumably the
# shape the model returns here; verify against an actual reply.
entities = Entities.from_json(parsed_output)

# Get the names list.
names = entities.names
print(names)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from langchain_community.embeddings import HuggingFaceEmbeddings
import requests
# Embedding model used by fetch_related_questions to vectorize question text.
embedding = HuggingFaceEmbeddings(model_name="bge-large-zh-v1.5")
# Endpoint that fetch_related_questions queries for a course's questions.
api_url = "http://localhost:8001/api/questions"

  embedding = HuggingFaceEmbeddings(model_name="bge-large-zh-v1.5")
  from .autonotebook import tqdm as notebook_tqdm


获取到的问题：
{'questionid': 5, 'studentid': 2021102549, 'courseid': 1, 'title': '如何理解LL(1)文法的预测分析表构建？', 'content': '在学习编译原理时，对LL(1)文法的预测分析表构建步骤不太清楚，特别是First集和Follow集的计算部分，希望能得到详细的解释。', 'tags': ['作业疑问', '概念理解'], 'images': ['/uploads/1739954446173.jpg'], 'status': 'open', 'pendingTime': '2025-03-02T16:07:29.079Z', 'createdAt': '2025-02-19T08:40:46.338Z', 'updatedAt': '2025-03-02T17:35:48.901Z', 'Student': {'username': '颜婷婷', 'nickname': 'QWQ', 'avatar': '/uploads/1740848107838.jpg'}, 'Course': {'coursename': '编译原理', 'teacherid': 1}}
{'questionid': 13, 'studentid': 2021102549, 'courseid': 1, 'title': 'test', 'content': 'test', 'tags': ['编程问题'], 'images': [], 'status': 'locked', 'pendingTime': None, 'createdAt': '2025-02-28T16:17:30.368Z', 'updatedAt': '2025-03-02T17:33:36.625Z', 'Student': {'username': '颜婷婷', 'nickname': 'QWQ', 'avatar': '/uploads/1740848107838.jpg'}, 'Course': {'coursename': '编译原理', 'teacherid': 1}}
推荐的问题：
问题 1: 在学习编译原理时，对LL(1)文法的预测分析表构建步骤不太清楚，特别是First集和Follow集的计算部分，希望能得到详细的解

In [6]:
def fetch_related_questions(course_id, current_question, top_k=3, min_similarity=0.5, timeout=10):
    """Return stored questions of a course that are most similar to ``current_question``.

    The questions are fetched from the module-level ``api_url`` and compared
    with the module-level ``embedding`` model: each question's title and
    content are joined, embedded, and scored by cosine similarity against the
    embedding of ``current_question``.

    Args:
        course_id: Course identifier sent as the ``courseIds`` query parameter.
        current_question: Text to find related questions for.
        top_k: Maximum number of questions to return.
        min_similarity: Only questions scoring strictly above this are returned.
        timeout: Seconds to wait for the HTTP API before giving up; without it
            the request could block indefinitely.

    Returns:
        A list of ``{"title", "content", "similarity"}`` dicts ordered from
        most to least similar. Empty when the request fails, the response is
        not valid JSON, or no question passes the threshold.
    """
    params = {"courseIds": [course_id]}
    try:
        response = requests.get(api_url, params=params, timeout=timeout)
        response.raise_for_status()
        questions = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose JSON
        # decode error is not a RequestException subclass.
        print(f"Error fetching related questions: {e}")
        return []

    # Nothing to rank; also avoids handing an empty batch to the embedder.
    if not questions:
        return []

    current_embedding = embedding.embed_query(current_question)
    combined_texts = [f"{question['title']} {question['content']}" for question in questions]
    # One batched call instead of one model invocation per question.
    question_embeddings = embedding.embed_documents(combined_texts)
    # Row 0 holds the similarity of the current question to every stored one.
    similarities = cosine_similarity([current_embedding], question_embeddings)[0]

    ranked = sorted(
        zip(questions, similarities), key=lambda pair: pair[1], reverse=True
    )[:top_k]
    return [
        {
            "title": q["title"],
            "content": q["content"],
            # Plain float so the result is JSON-serializable.
            "similarity": float(sim),
        }
        for q, sim in ranked if sim > min_similarity
    ]

# Example call
course_id = 1
current_question = "LL（1）文法是指什么？"
similar_questions = fetch_related_questions(course_id, current_question)

# Print the results, numbering the recommended questions from 1.
for i, q in enumerate(similar_questions, 1):
    print(f"问题 {i}: {q['title']} - {q['content']} (相似度: {q['similarity']:.2f})")

问题 1: 如何理解LL(1)文法的预测分析表构建？ - 在学习编译原理时，对LL(1)文法的预测分析表构建步骤不太清楚，特别是First集和Follow集的计算部分，希望能得到详细的解释。 (相似度: 0.64)
