In [2]:
from langchain.document_loaders import PyPDFLoader  # EPUB files can be converted to PDFs if needed
import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup

In [3]:
# If running in Google Colab, you may need to run this cell to make sure you're using UTF-8 locale to install LangChain
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [5]:
!mkdir -p "./test/library/jane-austen"
!mkdir -p "./test/library/victor-hugo"
!wget https://www.gutenberg.org/ebooks/1342.epub.noimages -O "./test/library/jane-austen/pride-and-prejudice.epub"
!wget https://www.gutenberg.org/ebooks/135.epub.noimages -O "./test/library/victor-hugo/les-miserables.epub"

--2024-11-26 21:36:32--  https://www.gutenberg.org/ebooks/1342.epub.noimages
Resolving www.gutenberg.org (www.gutenberg.org)... 2610:28:3090:3000:0:bad:cafe:47, 152.19.134.47
Connecting to www.gutenberg.org (www.gutenberg.org)|2610:28:3090:3000:0:bad:cafe:47|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://www.gutenberg.org/cache/epub/1342/pg1342.epub [following]
--2024-11-26 21:36:33--  https://www.gutenberg.org/cache/epub/1342/pg1342.epub
Reusing existing connection to [www.gutenberg.org]:443.
HTTP request sent, awaiting response... 200 OK
Length: 561345 (548K) [application/epub+zip]
Saving to: ‘./test/library/jane-austen/pride-and-prejudice.epub’


2024-11-26 21:36:34 (533 KB/s) - ‘./test/library/jane-austen/pride-and-prejudice.epub’ saved [561345/561345]

--2024-11-26 21:36:34--  https://www.gutenberg.org/ebooks/135.epub.noimages
Resolving www.gutenberg.org (www.gutenberg.org)... 2610:28:3090:3000:0:bad:cafe:47, 152.19.134.47
Connecting to www.

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def extract_text(file_path):
    """Extract plain text content from an EPUB file."""
    book = epub.read_epub(file_path)
    extracted_text = []
    for item in book.get_items():
        if item.get_type() == ebooklib.ITEM_DOCUMENT:
            soup = BeautifulSoup(item.content, 'html.parser')
            extracted_text.append(soup.get_text())
    return "\n".join(extracted_text)

# Function to split text into chunks
def split_text_into_chunks(text, size=512, overlap=30):
    """Split the input text into manageable chunks with optional overlap."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=size, chunk_overlap=overlap)
    return splitter.split_text(text)

# Extract text from EPUB files
pride_and_prejudice_text = extract_text("./test/library/jane-austen/pride-and-prejudice.epub")
les_miserables_text = extract_text("./test/library/victor-hugo/les-miserables.epub")

# Chunk the extracted text
pride_and_prejudice_chunks = split_text_into_chunks(pride_and_prejudice_text)
les_miserables_chunks = split_text_into_chunks(les_miserables_text)

  for root_file in tree.findall('//xmlns:rootfile[@media-type]', namespaces={'xmlns': NAMESPACES['CONTAINERNS']}):


In [10]:
chunked_docs = pride_and_prejudice_chunks + les_miserables_chunks

In [11]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
documents = [Document(page_content=chunk) for chunk in chunked_docs]

# Initialize the HuggingFace Embedding Model (Dense Embedding)
embedding_model = HuggingFaceEmbeddings(model_name='BAAI/bge-base-en-v1.5')

# Create the FAISS vector store from the documents
db = FAISS.from_documents(documents, embedding_model)

  embedding_model = HuggingFaceEmbeddings(model_name='BAAI/bge-base-en-v1.5')


In [12]:
from transformers import pipeline

# Make sure the token is passed correctly
reranker = pipeline("text-classification", model="cross-encoder/nli-deberta-v3-base", truncation=True, padding=True)




In [13]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import AnalyzeDocumentChain
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI
from sentence_transformers import CrossEncoder

# Initialize the CrossEncoder for reranking
reranker = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-2")

In [14]:
query = "What are the symptoms of diabetes?"
retrieved_docs = db.similarity_search(query, k=2)

In [15]:
retrieved_docs

[Document(metadata={}, page_content='sugar. O nibbling sex, your pretty little white teeth adore sugar. Now,\nheed me well, sugar is a salt. All salts are withering. Sugar is the most\ndesiccating of all salts; it sucks the liquids of the blood through the\nveins; hence the coagulation, and then the solidification of the blood;\nhence tubercles in the lungs, hence death. That is why diabetes borders on\nconsumption. Then, do not crunch sugar, and you will live. I turn to the'),
 Document(metadata={}, page_content='felt. Is any one the less ill because one does not know the name of one’s\nmalady?')]

In [16]:
reranked_results = []
for doc in retrieved_docs:
    # Combine the query and document for relevance scoring
    result = reranker.predict([(query, doc.page_content)])
    score = result[0]  # This should directly give you the score
    reranked_results.append((score, doc.page_content))

In [17]:
reranked_results.sort(key=lambda x: x[0], reverse=True)

# Display re-ranked results
for score, doc in reranked_results:
    print(f"Score: {score:.4f}, Document: {doc}")

Score: 0.0011, Document: sugar. O nibbling sex, your pretty little white teeth adore sugar. Now,
heed me well, sugar is a salt. All salts are withering. Sugar is the most
desiccating of all salts; it sucks the liquids of the blood through the
veins; hence the coagulation, and then the solidification of the blood;
hence tubercles in the lungs, hence death. That is why diabetes borders on
consumption. Then, do not crunch sugar, and you will live. I turn to the
Score: 0.0009, Document: felt. Is any one the less ill because one does not know the name of one’s
malady?


In [18]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_name = 'HuggingFaceH4/zephyr-7b-beta'


model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards: 100%|██████████| 8/8 [00:18<00:00,  2.28s/it]


In [21]:
query = "What is the history of the diabetes?"
context = " ".join([doc for _, doc in reranked_results])



## Setup the LLM chain

Finally, we have all the pieces we need to set up the LLM chain.

First, create a text_generation pipeline using the loaded model and its tokenizer.

Next, create a prompt template - this should follow the format of the model, so if you substitute the model checkpoint, make sure to use the appropriate formatting.

In [22]:
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import pipeline
from langchain_core.output_parsers import StrOutputParser

text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.2,
    do_sample=True,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=400,
)

llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

prompt_template = f"""
You are an AI assistant trained to answer questions based solely on provided context. 
Do not invent any information. Only use the following documents to answer the query.
Question: {query}
Context: {context}

Please provide a concise and accurate answer without hallucinations:
"""

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

llm_chain = prompt | llm | StrOutputParser()

Note: _You can also use `tokenizer.apply_chat_template` to convert a list of messages (as dicts: `{'role': 'user', 'content': '(...)'}`) into a string with the appropriate chat format._


Finally, we need to combine the `llm_chain` with the retriever to create a RAG chain. We pass the original question through to the final generation step, as well as the retrieved context docs:

In [23]:
from langchain_core.runnables import RunnablePassthrough

retriever = db.as_retriever()

rag_chain = (
 {"context": retriever, "question": RunnablePassthrough()}
    | llm_chain
)


## Compare the results

Let's see the difference RAG makes in generating answers to the library-specific questions.

In [24]:
question = "How do you check diabetes?"

First, let's see what kind of answer we can get with just the model itself, no context added:

In [25]:
llm_chain.invoke({"context":"", "question": question})

'\nYou are an AI assistant trained to answer questions based solely on provided context. \nDo not invent any information. Only use the following documents to answer the query.\nQuestion: What is the history of the diabetes?\nContext: sugar. O nibbling sex, your pretty little white teeth adore sugar. Now,\nheed me well, sugar is a salt. All salts are withering. Sugar is the most\ndesiccating of all salts; it sucks the liquids of the blood through the\nveins; hence the coagulation, and then the solidification of the blood;\nhence tubercles in the lungs, hence death. That is why diabetes borders on\nconsumption. Then, do not crunch sugar, and you will live. I turn to the felt. Is any one the less ill because one does not know the name of one’s\nmalady?\n\nPlease provide a concise and accurate answer without hallucinations:\n\nDiabetes is a condition characterized by high levels of sugar (glucose) in the blood due to either insufficient production or resistance to the effects of insulin, a

As you can see, the model interpreted the question as one about physical computer adapters, while in the context of PEFT, "adapters" refer to LoRA adapters.
Let's see if adding context from GitHub issues helps the model give a more relevant answer: