In [None]:
# Import necessary libraries
import os
import pandas as pd
import matplotlib.pyplot as plt
from transformers import GPT2TokenizerFast
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain
import textract
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import ipywidgets as widgets

In [None]:
#This function uses PyPDFLoader to load a PDF file from the specified path and splits it into pages. It's useful for processing PDFs page by page.
def load_and_split_pdf(file_path):
    """Load the PDF at ``file_path`` and return it split into documents.

    Args:
        file_path: Path of the PDF file, passed straight to ``PyPDFLoader``.

    Returns:
        Whatever ``PyPDFLoader(file_path).load_and_split()`` returns.
    """
    return PyPDFLoader(file_path).load_and_split()

In [None]:
#This function extracts text from a PDF file using textract, saves it to a text file, and then reads it back into a string. It's an alternate method for handling PDFs, especially when dealing with large text content.
def process_text_from_pdf(file_path, output_path='polish-recipes.txt'):
    """Extract the text of a PDF with textract and persist it as UTF-8 text.

    Args:
        file_path: Path of the PDF handed to ``textract.process``.
        output_path: File the extracted text is written to. Defaults to
            ``'polish-recipes.txt'``, the name that used to be hard-coded.

    Returns:
        str: The extracted text, as read back from ``output_path``.
    """
    raw_bytes = textract.process(file_path)
    # Pin the file encoding: the bytes are decoded as UTF-8, so relying on the
    # platform default encoding for the write could raise UnicodeEncodeError
    # (or garble the round trip) on systems whose default is not UTF-8.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(raw_bytes.decode('utf-8'))
    # Read the text back from disk, as before, so the returned string is
    # exactly what the saved file contains.
    with open(output_path, 'r', encoding='utf-8') as f:
        return f.read()

In [None]:
#This function uses GPT2TokenizerFast to tokenize the given text and returns the number of tokens. It's important for understanding the size of the text in terms of tokens, which is a common measure in NLP.
# Shared GPT-2 tokenizer instance, created on first use by _get_gpt2_tokenizer().
_GPT2_TOKENIZER = None


def _get_gpt2_tokenizer():
    """Return the shared GPT-2 tokenizer, loading it the first time it is needed."""
    global _GPT2_TOKENIZER
    if _GPT2_TOKENIZER is None:
        _GPT2_TOKENIZER = GPT2TokenizerFast.from_pretrained("gpt2")
    return _GPT2_TOKENIZER


def count_tokens(text):
    """Return the number of GPT-2 tokens in ``text``.

    The tokenizer is loaded once and reused across calls. This function is
    used as a ``length_function`` by ``split_text`` and is called per chunk in
    ``visualize_data``; reloading the tokenizer on every call, as before,
    repeated the same loading work each time.

    Args:
        text: The string to tokenize.

    Returns:
        int: ``len(tokenizer.encode(text))``.
    """
    return len(_get_gpt2_tokenizer().encode(text))


In [None]:
#Here, the text is split into smaller chunks using a RecursiveCharacterTextSplitter. This is useful for processing large texts in manageable sizes, especially when working with language models.
def split_text(text, chunk_size=512, chunk_overlap=24):
    """Split ``text`` into documents whose size is measured in tokens.

    Args:
        text: The full text to split.
        chunk_size: Value passed as ``chunk_size`` to the splitter. Lengths are
            measured with ``count_tokens``. Defaults to 512, the value that
            used to be hard-coded.
        chunk_overlap: Value passed as ``chunk_overlap`` to the splitter.
            Defaults to 24, the value that used to be hard-coded.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.create_documents([text])``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=count_tokens,
    )
    return text_splitter.create_documents([text])


In [None]:
#This function visualizes the distribution of token counts across the chunks of text. It helps in understanding how the text is distributed and is useful for debugging and analysis.
def visualize_data(chunks):
    """Plot a histogram of the per-chunk token counts.

    Args:
        chunks: Iterable of objects with a ``page_content`` attribute; each
            one's content is measured with ``count_tokens``.

    Returns:
        None. The histogram (19 bins) is shown via ``plt.show()``.
    """
    counts_per_chunk = []
    for chunk in chunks:
        counts_per_chunk.append(count_tokens(chunk.page_content))

    histogram_frame = pd.DataFrame({'Token Count': counts_per_chunk})
    histogram_frame.hist(bins=19)
    plt.show()

In [None]:
#This function generates embeddings for each text chunk using OpenAIEmbeddings and stores them in a FAISS vector database. It's crucial for efficient similarity searches in large text corpora.
def create_embeddings_and_database(chunks):
    """Build a FAISS vector store from ``chunks`` using OpenAI embeddings.

    Args:
        chunks: Documents passed to ``FAISS.from_documents``.

    Returns:
        The object returned by ``FAISS.from_documents(chunks, OpenAIEmbeddings())``.
    """
    return FAISS.from_documents(chunks, OpenAIEmbeddings())

In [None]:
#This function initializes and displays an interactive chatbot interface within a Jupyter environment. It utilizes a conversational retrieval chain to generate responses based on the user's input and the document's content.
def create_chatbot_interface(db):
    """Display a text box that answers questions about the indexed document.

    Builds a ``ConversationalRetrievalChain`` over ``db.as_retriever()`` and
    shows an ``ipywidgets.Text`` input. Each submitted question is sent to the
    chain together with the accumulated chat history, and the question and the
    answer are rendered as HTML widgets.

    Args:
        db: Vector store exposing ``as_retriever()``.

    Returns:
        None. The input widget is displayed as a side effect.
    """
    import html  # local import: only needed to escape text for widgets.HTML

    qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0.1), db.as_retriever())
    chat_history = []

    def on_submit(change):
        query = input_box.value.strip()
        input_box.value = ""

        # Ignore blank submissions instead of sending an empty question to the LLM.
        if not query:
            return

        if query.lower() == 'exit':
            print("Exiting chatbot.")
            # Stop accepting input so that "exit" really ends the session;
            # previously the box stayed active after the message.
            input_box.disabled = True
            return

        result = qa({"question": query, "chat_history": chat_history})
        answer = result['answer']
        chat_history.append((query, answer))
        # Escape both texts: they are free-form (user input / model output) and
        # would otherwise be interpreted as markup by widgets.HTML.
        display(widgets.HTML(f'<b>User:</b> {html.escape(query)}'))
        display(widgets.HTML(f'<b><font color="blue">Chatbot:</font></b> {html.escape(str(answer))}'))

    input_box = widgets.Text(placeholder='Enter your question:')
    # NOTE(review): Text.on_submit is reported as deprecated in ipywidgets 8 —
    # confirm against the installed version before upgrading.
    input_box.on_submit(on_submit)
    display(input_box)


In [None]:
#This section is the entry point of your script. It orchestrates the loading of the PDF, processing the text, and starting the chatbot interface.
if __name__ == "__main__":
    # Take the OpenAI API key from the environment rather than hardcoding it in
    # the notebook. An OPENAI_API_KEY that is already set is left untouched
    # (the previous code overwrote it with a placeholder); the user is prompted,
    # without echo, only when no key is available.
    if not os.environ.get("OPENAI_API_KEY"):
        from getpass import getpass
        os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

    # Path to your PDF file
    pdf_file_path = "./your-pdf-file.pdf"

    # Choose either simple or advanced method for processing the PDF
    # Simple Method: Load and split the PDF
    pages = load_and_split_pdf(pdf_file_path)

    # Advanced Method: Process text from PDF and split into chunks
    # Uncomment these lines if you want to use the advanced method
    # text = process_text_from_pdf(pdf_file_path)
    # chunks = split_text(text)

    # Optional: Visualize data
    # Uncomment to visualize data
    # visualize_data(chunks)

    # Create embeddings and database from chunks
    # Uncomment and modify these lines if you're using the advanced method
    # NOTE(review): with the simple method, `pages` is presumably what should be
    # passed here instead of `chunks` — confirm before enabling.
    # db = create_embeddings_and_database(chunks)

    # Start the chatbot interface
    # Uncomment and modify this line if you're using the advanced method
    # create_chatbot_interface(db)
