## Necessary Imports

In [None]:
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from transformers import pipeline
from langchain.chains import RetrievalQA
from dotenv import load_dotenv
import os
from groq import Groq
from langchain_groq import ChatGroq
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory
import ipywidgets as widgets
from IPython.display import display, HTML

#### This is a simple RAG framework. For our purposes, we break it down into a three-step process: indexing, retrieval, and generation.

## Indexing

### Loading Documents
The project uses the following books as the primary data sources for retrieval and generation:
1. "Verity" by Colleen Hoover
    - Genre: Psychological Thriller

    - Description: A gripping novel about a struggling writer, Lowen Ashleigh, who is hired to complete the remaining books in a successful series by the injured author, Verity Crawford. As Lowen works on the manuscripts, she uncovers dark secrets about Verity's life.

    - Use Case: The book's complex narrative and character dynamics make it an excellent source for testing retrieval and generation capabilities.

2. "The Girl on the Train" by Paula Hawkins
    - Genre: Mystery, Thriller

    - Description: A suspenseful story about Rachel, a woman who becomes entangled in a missing-person investigation connected to events she observes during her daily train commute. The novel explores themes of memory, truth, and deception.

    - Use Case: The intricate plot and unreliable narration provide rich content for testing advanced RAG techniques.

In [None]:
# Create one PyMuPDF loader per novel (Verity first, then The Girl on the Train).
verity_book, tgott_book = (
    PyMuPDFLoader("./Dataset/Verity-By-Colleen-Hoover.pdf"),
    PyMuPDFLoader("./Dataset/The-Girl-on-the-Train.pdf"),
)

In [None]:
# Load the pages of both books; Verity is loaded before The Girl on the Train.
verity_book_pages, tgott_book_pages = verity_book.load(), tgott_book.load()

In [None]:
def is_page_empty(page, min_text_length=0):
    """
    Tell whether a loaded page has no meaningful text.

    Leading and trailing whitespace is ignored when measuring the page.

    Args:
        page: A page object exposing its text as ``page_content``.
        min_text_length: Largest stripped length (in characters) that is
            still treated as empty.

    Returns:
        bool: True when the stripped text has at most ``min_text_length``
        characters, False otherwise.
    """
    stripped_text = page.page_content.strip()
    has_enough_text = len(stripped_text) > min_text_length
    return not has_enough_text

In [None]:
# Keep only the Verity pages that is_page_empty does not flag.
verity_book_pages_filtered = list(
    filter(lambda pg: not is_page_empty(pg), verity_book_pages)
)

# Same filtering for The Girl on the Train.
tgott_book_pages_filtered = list(
    filter(lambda pg: not is_page_empty(pg), tgott_book_pages)
)

In [None]:
# Combine both books into one corpus: Verity pages first, then The Girl on the Train.
documents = [*verity_book_pages_filtered, *tgott_book_pages_filtered]

In [None]:
# Sanity check: inspect the first document of the combined list.
print(documents[0])

### Text Splitting using Recursive Character Text Splitter 

In [None]:
# Splitter configuration: chunks of up to 500 units with a 50-unit overlap,
# where a "unit" is whatever length_function counts (here: characters via len).
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    is_separator_regex=False,
    chunk_overlap=50,
    chunk_size=500,
)

In [None]:
# Split every page of both books into chunks using the splitter configured above.
texts = text_splitter.split_documents(documents)

### Creating Embedding and Storing Text to Vector DB

In [None]:
# Initialize the embedding model (sentence-transformers/all-MiniLM-L6-v2);
# it is handed to the vector store below to embed the text chunks.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [None]:
# Create the FAISS vector store from the split chunks and the embedding model.
# The retriever used by the QA chain is derived from this store.
vectorstore = FAISS.from_documents(texts, embeddings)

## Retrieval

In [None]:
# Load the API key from .env file
load_dotenv()
chatgroq_api_key = os.getenv("GROQ_API_KEY")

# Fail fast with an actionable message: without this check a missing key is
# stored as None and only surfaces later when the Groq client is created.
if not chatgroq_api_key:
    raise RuntimeError(
        "GROQ_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

In [None]:
# Build the Groq chat model used for answer generation (temperature 0).
# NOTE(review): confirm that the "mixtral-8x7b-32768" model id is still
# served by Groq before relying on this cell.
chatgroq_model = ChatGroq(
    model_name="mixtral-8x7b-32768",
    api_key=chatgroq_api_key,
    temperature=0,
)

In [None]:
# Assemble the RetrievalQA chain: a similarity retriever over the FAISS store
# (k=5 per query) feeding the Groq chat model. Source documents are returned
# alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    retriever=vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 5},
    ),
    llm=chatgroq_model,
    chain_type="stuff",
    return_source_documents=True,
    # verbose=True  (uncomment while debugging)
)

## Generation

In [None]:
# Define the chatbot response function
def chatbot_response(user_input):
    """Answer a user question with the retrieval QA chain.

    Args:
        user_input: The question text, sent to ``qa_chain`` under the
            ``"query"`` key.

    Returns:
        str: The chain output's ``"result"`` entry rendered as a string.
    """
    # Use ``invoke`` rather than calling the chain object directly; the
    # direct-call form is deprecated in LangChain.
    result = qa_chain.invoke({"query": user_input})
    return str(result["result"])

# Create the chatbot UI
# Text input for user messages
user_input = widgets.Text(
    placeholder="Type your message here...",
    description="You:",
    layout=widgets.Layout(width="80%"),
)

# Button to submit messages
submit_button = widgets.Button(
    description="Send",
    button_style="success",
)

# Output area for the conversation.
# The former ``style={"description_width": "initial"}`` argument was dropped:
# ``widgets.Output`` does not declare a ``style`` trait, so it was an
# unrecognized constructor argument.
# NOTE(review): confirm against the installed ipywidgets version.
output = widgets.Output(
    layout=widgets.Layout(),
)

# Function to handle button click
def on_submit_button_click(b):
    """Handle a click on the Send button.

    Reads the text box, shows the user's message and the chatbot's answer in
    the ``output`` widget, then clears the text box. If the box is empty or
    contains only whitespace, a prompt to enter a message is shown instead.

    Args:
        b: The object passed by the button's click event; not used.
    """
    # Local import keeps this cell self-contained.
    from html import escape

    with output:
        user_message = user_input.value
        if not user_message.strip():  # Nothing to send
            display(HTML("<em>Please enter a message.</em>"))
            return

        # Escape the text so characters such as '<', '>' and '&' are shown
        # literally instead of being interpreted as HTML markup.
        display(HTML(f"<strong>You:</strong> {escape(user_message)}"))

        # Get the chatbot's response
        bot_response = chatbot_response(user_message)

        # Model output is untrusted text as well, so it is escaped too.
        display(HTML(f"<strong>Bot:</strong> {escape(bot_response)} <br>"))
        display(HTML("<br>"))

        # Clear the input box only after the exchange has been displayed
        user_input.value = ""

# Stack the text box, the Send button and the conversation area vertically.
chatbot_ui = widgets.VBox(children=[user_input, submit_button, output])

# Register the click handler on the Send button.
submit_button.on_click(on_submit_button_click)

# Render the assembled chatbot UI.
display(chatbot_ui)