#Develop a RAG-based question-answering system that allows users to upload a document, ask questions about its content, and receive relevant answers.
##API - openai
##Model=gpt**-3.5**-turbo

In [None]:
!pip install openai
!pip install faiss-cpu
!pip install PyPDF2
!pip install ipywidgets
!pip install sentence-transformers
!pip install pdfminer.six
!pip install torch
!pip install openai==0.28

In [None]:
import io
import faiss
import numpy as np
import re
import openai
import ipywidgets as widgets
from IPython.display import display, HTML
from pdfminer.high_level import extract_text
from sentence_transformers import SentenceTransformer

In [3]:
openai.api_key = " "

class DocumentProcessor:
    def __init__(self):
        self.faiss_index = None
        self.chunks = None
        self.model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

#Handle upload
    def handle_upload(self, change):
        for uploaded_filename in change['new']:
            file_info = change['new'][uploaded_filename]
            content = file_info['content']

            if uploaded_filename.endswith('.pdf'):
                text = self.extract_text_from_pdf(content)
            elif uploaded_filename.endswith('.txt'):
                text = self.extract_text_from_txt(content)
            else:
                text = "Unsupported file format. Please upload a PDF or TXT file."

            cleaned_text = self.clean_text(text)
            self.chunks = self.split_into_chunks(cleaned_text)
            embeddings = self.generate_embeddings(self.chunks)
            self.faiss_index = self.store_embeddings_in_faiss(embeddings)

            display(HTML(f"<p>File name: {uploaded_filename} <br> Status: processed<br>Type your queries in the question box and wait for the answer</p>"))
            self.enable_question_input()

    def enable_question_input(self):
        question_input.disabled = False
        submit_button.disabled = False

#Text preprocessing
    def extract_text_from_pdf(self, content):
        text = extract_text(io.BytesIO(content))
        return text

    def extract_text_from_txt(self, content):
        return content.decode('utf-8')

    def clean_text(self, text):
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s]', '', text)
        return text

    def split_into_chunks(self, text, chunk_size=500):
        words = text.split()
        chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
        return chunks

#Embedding generation
    def generate_embeddings(self, text_chunks):
        embeddings = self.model.encode(text_chunks)
        return embeddings

    def store_embeddings_in_faiss(self, embeddings):
        dimension = embeddings.shape[1]
        index = faiss.IndexFlatL2(dimension)
        index.add(np.array(embeddings))
        return index

#process question
    def process_question(self, question):
        question_embedding = self.model.encode([question])
        _, indices = self.faiss_index.search(np.array(question_embedding), k=5)
        relevant_chunks = [self.chunks[idx] for idx in indices[0]]
        return relevant_chunks

#Generate answer
    def generate_answer(self, question, relevant_chunks):
        prompt = f"Question: {question}\n\n"
        prompt += "Here are some relevant information:\n"
        for chunk in relevant_chunks:
            prompt += f"- {chunk}\n"
        prompt += "\nBased on the above information, please provide a detailed answer."


        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=200,
            stop=None,
            temperature=0.7
        )

        answer = response.choices[0].message['content']
        return answer

# Instantiate the DocumentProcessor
doc_processor = DocumentProcessor()

# Create UI using widgets
uploader = widgets.FileUpload(
    accept='.pdf,.txt',
    multiple=False
)


uploader.observe(doc_processor.handle_upload, names='value')

display(uploader)

question_input = widgets.Text(
    value='',
    placeholder='Type your question here',
    description='Question:',
    disabled=True
)


submit_button = widgets.Button(
    description='Submit',
    disabled=True,
    button_style='primary'
)


answer_display = widgets.Textarea(
    value='',
    placeholder='The answer will appear here...',
    description='Answer:',
    disabled=True
)

#Handle submit button
def handle_submit(_):
    global doc_processor
    question = question_input.value.strip()

    if doc_processor.faiss_index is not None and doc_processor.chunks is not None:
        relevant_chunks = doc_processor.process_question(question)
        answer = doc_processor.generate_answer(question, relevant_chunks)
        answer_display.value = answer


submit_button.on_click(handle_submit)


display(question_input)
display(submit_button)
display(answer_display)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

FileUpload(value={}, accept='.pdf,.txt', description='Upload')

Text(value='', description='Question:', disabled=True, placeholder='Type your question here')

Button(button_style='primary', description='Submit', disabled=True, style=ButtonStyle())

Textarea(value='', description='Answer:', disabled=True, placeholder='The answer will appear here...')

RateLimitError: You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.