In [1]:
import os

def load_documents(directory, limit=None):
    documents = []
    filenames = os.listdir(directory) if limit is None else os.listdir(directory)[:limit]
    for filename in filenames:
        if filename.endswith('.txt'):
            with open(os.path.join(directory, filename), 'r') as f:
                documents.append(f.read())
    return documents

documents = load_documents("/home/eric/Documents/CVC_Internship/data/spdocvqa_ocr_txt", 200)

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, GenerationConfig

# Choose your model
model_name = 'gpt2'  # Replace with your chosen model

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Set the generation configuration
generation_config = GenerationConfig(
    max_length=512,      # Adjust as needed (input + output length)
    max_new_tokens=50,   # Adjust as needed (output length)
    # You can set other generation parameters here
)
model.generation_config = generation_config

# Create a text-generation pipeline
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

  from .autonotebook import tqdm as notebook_tqdm
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [3]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Load an embedding model
embedding_model = HuggingFaceEmbeddings(model_name='BAAI/bge-small-en-v1.5')

# Create the vector store directly from texts and embeddings
vectorstore = FAISS.from_texts(documents, embedding_model)

In [4]:
from langchain.chains.retrieval_qa.base import RetrievalQA
from langchain_huggingface import HuggingFacePipeline

# Create an LLM wrapper for LangChain
llm = HuggingFacePipeline(
    pipeline=generator,
    model_kwargs={"max_length": 512, "max_new_tokens": 50}
)

# Create a retriever from the vector store
retriever = vectorstore.as_retriever(search_kwargs={"k": 1})

# Create the RAG chain using from_chain_type
rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # Options: "stuff", "map_reduce", "refine", "map_rerank"
    retriever=retriever
)

In [5]:
documents[100]

'Chapter to be written\nby Dr. Shank\n"NUTRITION PRINCIPLES"\nSource: https://www.industrydocuments.ucsf.edu/docs/mhbf0227\n'

In [6]:
# Your query
query = "Who wrote the chapter of NUTRITION PRINCIPLES?"

# Get the answer from the RAG system
answer = rag_chain.invoke(query)

print("Question:", query)
print("Answer:", answer)

Question: Who wrote the chapter of NUTRITION PRINCIPLES?
Answer: {'query': 'Who wrote the chapter of NUTRITION PRINCIPLES?', 'result': 'Use the following pieces of context to answer the question at the end. If you don\'t know the answer, just say that you don\'t know, don\'t try to make up an answer.\n\nChapter to be written\nby Dr. Shank\n"NUTRITION PRINCIPLES"\nSource: https://www.industrydocuments.ucsf.edu/docs/mhbf0227\n\n\nQuestion: Who wrote the chapter of NUTRITION PRINCIPLES?\nHelpful Answer: The book itself does not cover NUTRIATION. The book is written by Dr. Shank, the author of NUTRITION PRINCIPLES, of which I am the publisher. I will post the book on my personal Facebook page.'}
