In [2]:
!pip install -q langchain langchain_community langchain_chroma
!pip install -q --disable-pip-version-check -U langchain-openai
!pip install -q openai gradio PyPDF2


In [1]:
# Assemble the .gitignore contents entry by entry; an empty string yields a blank line.
gitignore_lines = (
    "# Ignore Python compiled files",
    "__pycache__/",
    "*.py[cod]",
    "",
    "# Ignore Jupyter Notebook checkpoints",
    ".ipynb_checkpoints/",
    "",
    "# Ignore environment files",
    ".env",
)
gitignore_content = "\n".join(gitignore_lines) + "\n"

# Write the file into the current working directory (mode 'w' replaces any existing file).
with open('.gitignore', 'w') as gitignore_file:
    gitignore_file.write(gitignore_content)

print(".gitignore file created successfully!")


.gitignore file created successfully!


In [3]:
import os
import gradio as gr
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
import bs4
import PyPDF2
import time
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI


USER_AGENT environment variable not set, consider setting it to identify your requests.


In [4]:
# Identify outgoing HTTP requests with a browser-like User-Agent string.
# NOTE(review): the import cell already printed a "USER_AGENT environment variable
# not set" warning, so this presumably has to run before those imports to silence it.
_BROWSER_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/89.0.4389.82 Safari/537.36'
)
os.environ['USER_AGENT'] = _BROWSER_USER_AGENT

# Function to set API keys based on the user's input
def set_keys():
    """Prompt for the API keys and store them in ``os.environ``.

    The values are read with ``getpass`` instead of ``input()`` so that what the
    user types is not echoed into the cell output, where it would be saved (and
    shared) together with the notebook. Whitespace picked up while copy/pasting
    a key is stripped.

    Sets the environment variables ``OPENAI_API_KEY``, ``OPENAI_PROJECT_KEY``
    and ``LANGCHAIN_API_KEY``. Returns nothing.
    """
    import getpass  # function-scope import keeps this cell self-contained

    prompts = (
        ("OPENAI_API_KEY", "Enter your OpenAI API Key: "),
        ("OPENAI_PROJECT_KEY", "Enter your OpenAI Project Key: "),
        ("LANGCHAIN_API_KEY", "Enter your LangChain API Key: "),
    )
    for env_name, prompt in prompts:
        os.environ[env_name] = getpass.getpass(prompt).strip()
    print("API keys set successfully!")

# Prompt for the keys now; later code reads OPENAI_API_KEY back from os.environ.
set_keys()


Enter your OpenAI API Key:  [REDACTED]
Enter your OpenAI Project Key:  [REDACTED]
Enter your LangChain API Key:  [REDACTED]


API keys set successfully!


In [5]:
# Helper function to load documents from different sources (PDF, URL, or a text file)
def load_document(source, is_url=False):
    """Load ``source`` into a list of LangChain ``Document`` objects.

    Args:
        source: When ``is_url`` is true, the URL to fetch. Otherwise an uploaded
            file, given either as a file-like object exposing ``.name`` (and
            ``.read()`` for text files) or as a filesystem path string.
        is_url: Treat ``source`` as a web page URL instead of a file.

    Returns:
        A list of ``Document`` objects: whatever ``WebBaseLoader.load()``
        returns for URLs, or a single document holding the full text of the
        PDF / UTF-8 text file.
    """
    if is_url:
        # Only keep the parts of the page carrying these CSS classes.
        loader = WebBaseLoader(
            web_paths=(source,),
            bs_kwargs=dict(parse_only=bs4.SoupStrainer(class_=("post-content", "post-title", "post-header")))
        )
        return loader.load()

    # An upload may be a file-like object with a .name or a plain path string
    # (NOTE(review): which one Gradio passes depends on its version/`type` setting).
    path = getattr(source, "name", source)

    if str(path).lower().endswith(".pdf"):  # case-insensitive so "REPORT.PDF" is also a PDF
        reader = PyPDF2.PdfReader(source)
        # extract_text() can yield nothing for image-only pages; treat that as empty text.
        text = "".join(page.extract_text() or "" for page in reader.pages)
    elif hasattr(source, "read"):
        raw = source.read()
        text = raw.decode("utf-8") if isinstance(raw, bytes) else raw
    else:
        # Path-style upload: read the bytes ourselves.
        with open(path, "rb") as handle:
            text = handle.read().decode("utf-8")

    return [Document(page_content=text)]  # Convert to LangChain Document format


In [6]:
# Function to create embeddings with retry logic
def create_embeddings_with_retry(documents, max_retries=3, wait_time=5):
    """Embed ``documents`` into a new Chroma vector store, retrying on rate limits.

    Args:
        documents: LangChain documents (typically text chunks) to embed.
        max_retries: Maximum number of attempts at building the store.
        wait_time: Seconds to wait between rate-limited attempts.

    Returns:
        The Chroma vector store, or ``None`` when the OpenAI API key is missing
        or empty, a non-rate-limit error occurs, or every attempt was rate
        limited. Errors are printed rather than raised.
    """
    import uuid  # function-scope import keeps this cell self-contained

    try:
        if not os.environ.get("OPENAI_API_KEY"):
            raise ValueError("OpenAI API key is missing!")

        embedding = OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"])

        for attempt in range(max_retries):
            # A unique collection per attempt keeps this upload separate from
            # documents embedded by earlier calls (or by a half-finished attempt)
            # instead of appending to one shared default collection.
            collection_name = f"docs-{uuid.uuid4().hex}"
            try:
                return Chroma.from_documents(
                    documents=documents,
                    embedding=embedding,
                    collection_name=collection_name,
                )
            except Exception as e:
                print(f"Error: {e}")
                rate_limited = (
                    type(e).__name__ == "RateLimitError"
                    or "rate limit" in str(e).lower()
                )
                if not rate_limited:
                    return None
                # Only wait when another attempt will actually follow.
                if attempt + 1 < max_retries:
                    print(f"Retrying in {wait_time} seconds... (Attempt {attempt + 1}/{max_retries})")
                    time.sleep(wait_time)

        print(f"Giving up after {max_retries} rate-limited attempts.")
        return None
    except Exception as e:
        print(f"Critical Error: {e}")
        return None


In [7]:
# Function to process the uploaded file or link, create embeddings, and set up retriever
def process_and_create_retriever(uploaded_file, link):
    """Turn an uploaded file or a URL into a retriever over its text chunks.

    An uploaded file takes precedence over ``link``.

    Args:
        uploaded_file: The uploaded document, or ``None`` when nothing was uploaded.
        link: URL text entered by the user; may be ``None`` or blank.

    Returns:
        A retriever built from the vector store, or ``None`` when no source was
        given, the source produced no text chunks, or the embeddings could not
        be created.
    """
    # Loading document based on user input
    url = link.strip() if link is not None else ""
    if uploaded_file is not None:
        docs = load_document(uploaded_file)
    elif url:
        # Pass the stripped URL so stray spaces/newlines do not reach the loader.
        docs = load_document(url, is_url=True)
    else:
        return None

    # Split the documents into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(docs)
    if not splits:
        # Nothing to embed (e.g. an empty file or a page with no matching content).
        return None

    # Create embeddings
    vectorstore = create_embeddings_with_retry(splits)
    if vectorstore is None:
        return None
    return vectorstore.as_retriever()


In [8]:
# Function to format the documents for output
def format_docs(docs):
    """Return the ``page_content`` of every document, separated by blank lines."""
    separator = "\n\n"
    contents = [document.page_content for document in docs]
    return separator.join(contents)


In [9]:
# Answer a question with a retrieval-augmented (RAG) chain over the given retriever
def ask_question(question, retriever):
    """Answer ``question`` using context fetched through ``retriever``.

    Args:
        question: The user's question text.
        retriever: Retriever over the loaded document, or ``None`` when no
            retriever could be built.

    Returns:
        ``"Response: <answer>"`` on success, or an ``"Error: ..."`` message when
        there is no retriever or the question is blank.
    """
    if retriever is None:
        return "Error: No document uploaded or link provided."
    if question is None or not question.strip():
        # Avoid a hub fetch and an LLM call for an empty query.
        return "Error: Please enter a question."

    # Fetched from the LangChain hub on every call.
    prompt = hub.pull("rlm/rag-prompt")

    # Define the RAG chain: retrieved documents become the context, the raw
    # question is passed through unchanged, and the model output is parsed to str.
    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | ChatOpenAI(model="gpt-4o-mini")
        | StrOutputParser()
    )

    # Getting a response
    response = rag_chain.invoke(question)
    return f"Response: {response}"


In [10]:
# Gradio UI for asking questions
def gradio_interface():
    """Build the question-answering UI.

    The layout is a file upload and a URL box side by side, a question box, a
    submit button and an answer box. Clicking the button builds a retriever
    from the upload/URL and asks the question against it.

    Returns:
        The ``gr.Blocks`` app; the caller is responsible for launching it.
    """
    with gr.Blocks() as question_ui:
        # Add the inputs for file and link
        with gr.Row():
            uploaded_file = gr.File(label="Upload a Document (PDF/Text)")
            link_input = gr.Textbox(label="Or You can enter a URL", placeholder="https://example.com")

        question_input = gr.Textbox(label="Ask a Question")
        ask_button = gr.Button("Submit")
        answer_output = gr.Textbox(label="Answer")

        # Logic for handling the user input
        def on_submit(question, uploaded_file, link_input):
            # Validate before building the retriever: that step embeds the whole
            # document, so a blank question should not trigger it.
            if question is None or not question.strip():
                return "Error: Please enter a question."
            try:
                retriever = process_and_create_retriever(uploaded_file, link_input)
                return ask_question(question, retriever)
            except Exception as exc:
                # Show the failure (bad URL, unreadable file, API error) in the
                # answer box instead of an unexplained UI error.
                return f"Error: {exc}"

        ask_button.click(fn=on_submit, inputs=[question_input, uploaded_file, link_input], outputs=answer_output)

    return question_ui


In [11]:
# Build the UI and start the Gradio server on port 7860.
# NOTE(review): server_name="0.0.0.0" listens on every network interface, not just
# localhost, so other machines that can reach this host can use the app (and the
# API key it runs with) — confirm that exposure is intended.
gradio_interface().launch(server_name="0.0.0.0", server_port=7860)


* Running on local URL:  http://0.0.0.0:7860

To create a public link, set `share=True` in `launch()`.


