In [1]:
import os
import sys

sys.path.append('..')

from src.utils.paths import get_project_path

from unstructured_ingest.v2.pipeline.pipeline import Pipeline
from unstructured_ingest.v2.interfaces import ProcessorConfig
from unstructured_ingest.v2.processes.connectors.local import (
    LocalIndexerConfig,
    LocalDownloaderConfig,
    LocalConnectionConfig,
    LocalUploaderConfig
)

from unstructured.staging.base import elements_from_json

from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
from unstructured_ingest.v2.processes.chunker import ChunkerConfig


from langchain_core.documents import Document
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings


from langchain.llms import Ollama
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [213]:
def local_parse_data(directory_with_pdfs: str, directory_with_results: str):
    
    pdf_files = []
    for root, dirs, files in os.walk(directory_with_pdfs):
        for file in files:
            if file.endswith('.pdf'):
                pdf_files.append(os.path.join(root, file))


    for pdf_file in pdf_files:
        Pipeline.from_configs(
            context=ProcessorConfig(),
            indexer_config=LocalIndexerConfig(input_path=pdf_file),
            downloader_config=LocalDownloaderConfig(),
            source_connection_config=LocalConnectionConfig(),
            partitioner_config=PartitionerConfig(
                partition_by_api=True,
                api_key=os.getenv("UNSTRUCTURED_API_KEY"),
                partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
                strategy="hi_res",
                additional_partition_args={
                    "split_pdf_page": True,
                    "split_pdf_concurrency_level": 15,
                },
            ),
            uploader_config=LocalUploaderConfig(output_dir=directory_with_results)
        ).run()

directory_with_pdfs = os.path.join(get_project_path(), 'data')
directory_with_results = os.path.join(get_project_path(), 'data', 'parsed_pages')

# local_parse_data(directory_with_pdfs, directory_with_results)


In [214]:
def load_processed_files(directory_path: str) -> list:
    elements = []
    for filename in os.listdir(directory_path):
        if filename.endswith('.json'):
            file_path = os.path.join(directory_path, filename)
            try:
                elements.extend(elements_from_json(filename=file_path))
            except IOError:
                print(f"Error: Could not read file {filename}.")

    return elements

elements = load_processed_files(directory_with_results)

In [215]:
documents = []
for element in elements:
    metadata = element.metadata.to_dict()
    documents.append(Document(page_content=element.text, metadata=metadata))

db = FAISS.from_documents(documents, HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5"))
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 10})

INFO: Use pytorch device_name: cuda
INFO: Load pretrained SentenceTransformer: BAAI/bge-base-en-v1.5


In [216]:
# Set up the Ollama model and prompt configuration
llm = Ollama(model="llama3.1:8b")

# Define the prompt template for question answering
prompt_template = """
<|start_header_id|>user<|end_header_id|>
You are an assistant for answering questions based on extracted document sections.
Provide clear and concise answers to the question using the context below.
Question: {question}
Context: {context}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

# Create a PromptTemplate instance
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

def format_docs(docs):
    # Format extracted document sections for the context
    return "\n\n".join(doc.page_content for doc in docs)

#Build the RAG chain using the ollama-based LLM
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
        | StrOutputParser()
)


In [217]:
question = input()

In [219]:
print(rag_chain.invoke(question))

Hello! It seems like we're starting with a greeting. What would you like to talk about?


In [None]:
for filename1 in os.listdir(os.path.join(get_project_path(), 'data', 'Оригинальные статьи')):
    print(filename1)
    for filename2 in os.listdir(os.path.join(get_project_path(), 'data', 'Оригинальные статьи', filename1)):
        if filename2.endswith('.pdf'):
            
        print(filename2)

2PP
2pp_for_biomedicine
materials_for_2pp
structure_created_by_2pp
methods_nanostructure
Conventional photolithography and reactive ion etching
High energy beam processing method
Hot embossing
idk
Interference lithography
Nanoimprinting
Thin film deposition processing technology
ml_and_nanostructure
2018_Generative Model for the Inverse Design of Metasurfaces.pdf
2018_Plasmonic nanostructure design and characterization via Deep Learning.pdf
2019_Deep learning for accelerated all-dielectric metasurface design.pdf
2019_Optimisation of colour generation from dielectric nanostructures using reinforcement learning.pdf
2019_Training artificial neural network for optimization of nanostructured VO2-based smart window performance.pdf
2020_Machine-Learning-Guided Morphology Engineering of Nanoscale Metal-Organic Frameworks.pdf
2020_Multitask deep-learning-based design of chiral plasmonic metamaterials.pdf
2020_Nanomaterial Synthesis Insights from Machine Learning of Scientific Articles by Extrac