In [2]:
import pdfplumber
import pdfplumber
import os
import glob
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import DirectoryLoader
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain import hub



In [3]:
class PDFLoader:
    """Load the text of every PDF in a directory that matches a glob pattern.

    Constructing the loader only stores its settings; files are opened when
    ``load`` is called.
    """

    def __init__(self, directory, glob_pattern="*.pdf"):
        """Store the search location.

        Args:
            directory: Folder that is searched for PDF files.
            glob_pattern: Filename pattern joined onto ``directory``
                (defaults to ``"*.pdf"``).
        """
        self.directory = directory
        self.glob_pattern = glob_pattern

    def load(self):
        """Extract the text of each matching PDF.

        Returns:
            list[str]: One string per PDF, made of that file's page texts
            joined by a single space, ordered by file path. An empty list
            when no file matches the pattern.
        """
        pattern = os.path.join(self.directory, self.glob_pattern)
        documents = []
        # glob.glob yields matches in a filesystem-dependent order; sorting
        # keeps the document (and therefore chunk) order reproducible.
        for pdf_path in sorted(glob.glob(pattern)):
            with pdfplumber.open(pdf_path) as pdf:
                # A page whose extract_text() result is falsy contributes an
                # empty string instead of breaking the join.
                full_text = " ".join(page.extract_text() or '' for page in pdf.pages)
            documents.append(full_text)
        return documents


In [4]:
# Point the loader at the local folder of source PDFs and extract each file's text.
# NOTE(review): the Document-wrapping cell further down calls loader.load() again
# and rebinds `documents`, so the list produced here is only useful for inspection.
loader = PDFLoader(directory='./Input_text', glob_pattern="*.pdf")
documents = loader.load()

In [6]:
class Document:
    """Minimal text-plus-metadata container for the text splitter.

    It exposes the two attributes this notebook assumes the splitter reads:
    ``page_content`` and ``metadata``.
    """

    def __init__(self, text, metadata=None):
        """Wrap one extracted text.

        Args:
            text: The document text; stored as ``page_content``.
            metadata: Optional mapping of extra fields (for example the source
                file). It is copied, so instances never share a dict with the
                caller or with each other. Defaults to an empty dict.
        """
        self.page_content = text  # attribute name the splitter is assumed to expect
        self.metadata = dict(metadata) if metadata is not None else {}

# Turn every extracted PDF string into a Document so the splitter can consume it.
documents = list(map(Document, loader.load()))

# Split the documents into overlapping chunks (chunk_size=1000, chunk_overlap=200).
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)
chunks = text_splitter.split_documents(documents)


In [7]:
# Embed the chunks with OpenAI embeddings and index them in a Chroma vector store.
# NOTE(review): no persist_directory is passed; presumably the index lives only in
# this kernel and re-running the cell may insert the same chunks again. Confirm
# against the Chroma documentation.
vectorstore = Chroma.from_documents(documents=chunks, embedding=OpenAIEmbeddings())

# Similarity-search retriever configured with k=6.
retriever = vectorstore.as_retriever(
    search_kwargs={"k": 6},
    search_type="similarity",
)


In [8]:
# NOTE(review): this repeats, argument for argument, the retriever already built
# in the previous cell; one of the two statements can be dropped.
retriever = vectorstore.as_retriever(
    search_kwargs={"k": 6},
    search_type="similarity",
)


In [9]:
# Pull the "rlm/rag-prompt" prompt from the LangChain Hub; it is the prompt
# stage of the chain assembled below.
# NOTE(review): presumably a network fetch of the latest published version.
# Consider pinning a specific commit for reproducibility; confirm.
prompt = hub.pull("rlm/rag-prompt")




In [10]:
# Chat model that writes the final answer; temperature is set to 0.
llm = ChatOpenAI(
    temperature=0,
    model_name="gpt-3.5-turbo",
)


In [11]:
def format_docs(docs):
    """Merge the text of several documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        str: The ``page_content`` values in input order, separated by a blank
        line; an empty string when ``docs`` is empty.
    """
    contents = [entry.page_content for entry in docs]
    separator = "\n\n"
    return separator.join(contents)



In [12]:
# Retrieval-augmented QA pipeline, composed left to right with the `|` operator.
# The value passed to rag_chain.invoke(...) (a question string in this notebook)
# is the input to the first stage.
rag_chain = ( 
    # "context": the retriever piped into format_docs, i.e. the retrieved
    #            documents' page_content joined into one string.
    # "question": RunnablePassthrough(), presumably forwarding the input
    #             question unchanged (confirm against the LangChain docs).
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    # The mapping above feeds the prompt pulled from the hub ...
    | prompt
    # ... whose output goes to the chat model ...
    | llm 
    # ... and the model reply is passed through StrOutputParser(); the query
    # cells print the result as plain text.
    | StrOutputParser()
)

In [14]:
# Run one question through the RAG chain and print the returned answer.
result = rag_chain.invoke("What is GEM format?")
print(result)

GEM format is the Gene Expression Matrix file format used to store gene expression data. It includes information such as gene ID, coordinates, MID count, and optional cell ID. GEM files are used in the Stereo-seq analysis workflow to store gene expression matrices.


In [15]:
# Ask a comparison question spanning three file formats and print the answer.
# NOTE(review): the recorded answer says RPI is not mentioned in the retrieved
# context, yet the later "RPI v0.0.2" query is answered from the same corpus.
# Retrieval looks too narrow for this multi-topic question; consider a larger
# k or asking about each format separately.
result = rag_chain.invoke("What is the differences between GEF GEM and RPI files")
print(result)




The GEF file format is designed for multi-dimensional data storage and high computation efficiency, while the GEM file format is used for gene expression matrices. RPI files are not specifically mentioned in the provided context, so the exact differences between GEF GEM and RPI files are not clear.


In [17]:
# Ask specifically about the RPI v0.0.2 format and print the answer.
result = rag_chain.invoke("What is the RPI v0.0.2")
print(result)

The RPI v0.0.2 is an upgraded version of the RPI file format that supports storing and organizing multiple stained microscopy images in groups, with improved tissue recognition accuracy and performance. It also includes bug fixes related to version compatibility and gene expression distribution plot display. The RPI v0.0.2 is part of the STOmics Stereo-seq Analysis Workflow File Format Manual released in March 2023.
