In [7]:
from langchain.document_loaders import PyPDFLoader
from langchain.chains import RetrievalQA
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain import OpenAI
import os
import fnmatch

### notes01:
    PDF提取:该脚本使用PyPDF2从提供的PDF中提取文本。
    矢量存储创建:然后使用FAISS创建矢量存储，并通过OpenAI生成嵌入。
    基于rag的检索:使用LangChain的RetrievalQA，它根据特定的查询提取摘要、主体和摘要等部分。

In [8]:
# Initialize the OpenAI API key
os.environ['OPENAI_API_KEY'] = 'your-openai-api-key'

In [None]:
def load_pdf(file_path):
    """Loads PDF content using PyPDFLoader."""
    loader = PyPDFLoader(file_path)
    documents = loader.load()
    return documents

In [None]:
def create_vectorstore(documents):
    """Create a vector store using the loaded documents."""
    embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.from_documents(documents, embeddings)
    return vectorstore

In [None]:
def extract_information(query, vectorstore):
    """Use LangChain's RetrievalQA to extract information based on the query."""
    qa_chain = RetrievalQA.from_chain_type(
        llm=OpenAI(),
        retriever=vectorstore.as_retriever()
    )
    result = qa_chain.run(query)
    return result

In [4]:
def find_pdf_files(root_dir):
    """
    该函数接收一个目录路径作为参数，并返回该目录及其子目录中所有PDF文件的绝对路径列表。

    :param root_dir: 包含PDF文件的根目录路径
    :return: PDF文件的绝对路径列表
    """
    pdf_files = []

    for dirpath, dirnames, filenames in os.walk(root_dir):
        for filename in fnmatch.filter(filenames, '*.pdf'):
            pdf_files.append(os.path.join(dirpath, filename))

    return pdf_files

In [None]:
# 该方法用于验证PDF文件数量是否不为0且在100个以内，并抛出异常,从而避免OOM。
def validate_pdf_file(file_path):
    if not file_path:
        raise ValueError("No PDF files found in the directory.")

    if len(file_path) > 100:
        raise ValueError("Too many PDF files found in the directory.")

In [6]:
directory = './ShuLiYou'

# 查找出所有PDF文件
file_path_list = find_pdf_files(directory)

validate_pdf_file(file_path_list)

for file_path in file_path_list:
    print("file_path:", file_path)

    # Step 1: Load PDF using PyPDFLoader
    documents = load_pdf(file_path)

    # Step 2: Create vectorstore for the loaded documents
    vectorstore = create_vectorstore(documents)

    # Step 3: Define the queries for abstract, body, and summary
    queries = {
        "abstract": "Extract the abstract of the document.",
        "body": "Extract the main body of the document.",
        "summary": "Provide a summary of the document."
    }

    # Step 4: Extract information
    extracted_info = {}
    for key, query in queries.items():
        extracted_info[key] = extract_information(query, vectorstore)

    # Print the extracted information
    for section, content in extracted_info.items():
        print(f"\n--- {section.capitalize()} ---\n")
        print(content)
    break

file_path: ./ShuLiYou/Pd/pyrrole/Palladium-Catalyzed Asymmetric Intramolecular Dearomative Heck Reaction of Pyrrole Derivatives.pdf
file_path: ./ShuLiYou/Pd/pyrrole/Palladium(0)-catalyzed intramolecular dearomative arylation of pyrroles.pdf
file_path: ./ShuLiYou/Pd/pyrrole/Highly Regio- and Enantioselective Synthesis of Polysubstituted 2H‑Pyrroles via Pd-Catalyzed Intermolecular Asymmetric Allylic Dearomatization of Pyrroles.pdf
file_path: ./ShuLiYou/Pd/pyrrole/Mechanistic Insights into the Pd-Catalyzed Intermolecular Asymmetric Allylic Dearomatization of Multisubstituted Pyrroles- Understanding the Remarkable Regio- and Enantioselectivity.pdf
file_path: ./ShuLiYou/chiral/naphthol_photo/Asymmetric Dearomatization of b-Naphthols through an Amination Reaction Catalyzed by a Chiral Phosphoric Acid**.pdf
file_path: ./ShuLiYou/chiral/naphthol_photo/Chiral phosphoric acid catalyzed aminative dearomatization of α-naphthols:Michael addition sequence.pdf
file_path: ./ShuLiYou/chiral/naphthol_ph