# **Steps to Build the Application**

## **Step 1: Install Required Libraries**

In [1]:
pip install PyMuPDF faiss-cpu sentence-transformers transformers streamlit

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.12-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.2.1-py3-none-any.whl.metadata (10 kB)
Collecting streamlit
  Downloading streamlit-1.39.0-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting watchdog<6,>=2.1.5 (from streamlit)
  Downloading watchdog-5.0.3-py3-none-manylinux2014_x86_64.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.9/41.9 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Downloading PyMuPDF-1.24.12-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (19.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.6/19.6 MB[0m [31m27.3 MB/s

## **Step 2: Extract Text from PDFs**

In [2]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of every page in a PDF.

    Args:
        pdf_path: Either a filesystem path (``str`` or ``pathlib.Path``) or a
            binary file-like object exposing ``read()``, such as the objects
            returned by Streamlit's ``st.file_uploader``.

    Returns:
        str: The text of all pages joined in page order; ``""`` when no page
        yields any text.
    """
    if hasattr(pdf_path, "read"):
        # In-memory uploads have no usable path on disk, so hand PyMuPDF the
        # raw bytes and name the format explicitly.
        doc = fitz.open(stream=pdf_path.read(), filetype="pdf")
    else:
        doc = fitz.open(pdf_path)

    # The context manager closes the document even if a page fails to parse.
    with doc:
        return "".join(page.get_text() for page in doc)

## **Step 3: Load Pre-Trained HuggingFace Model for Question Generation**

In [3]:
from transformers import pipeline

# Build the Hugging Face text2text pipeline used to generate questions
question_generator = pipeline(
    task="text2text-generation",
    model="valhalla/t5-base-qg-hl",
)

def generate_questions(text):
    """Run the question-generation pipeline on a single passage.

    Args:
        text: Passage to generate questions from.

    Returns:
        list: The value returned by ``question_generator`` for ``text``
        (callers in this notebook read ``'generated_text'`` from each item),
        or an empty list when ``text`` is ``None`` or only whitespace.
    """
    # Nothing to ask about: skip the model call instead of generating from
    # an empty prompt.
    if text is None or not text.strip():
        return []
    return question_generator(text)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/15.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565



## **Step 4: Generate Embeddings for Text**

In [4]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Sentence-embedding model shared by the embedding helpers in this notebook
embedder = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')

def get_text_embedding(text):
    """Embed one piece of text with the shared ``embedder``.

    The text is wrapped in a one-element list for ``embedder.encode`` and the
    first (only) row of the result is returned.
    """
    single_item_batch = [text]
    encoded = embedder.encode(single_item_batch)
    return encoded[0]

# Store embeddings in FAISS for retrieval
def create_faiss_index(embeddings):
    """Build a flat L2 FAISS index populated with the given vectors.

    Args:
        embeddings: 2-D array-like of shape ``(n_vectors, dimension)``. A
            single 1-D vector is treated as one row.

    Returns:
        faiss.IndexFlatL2: Index of the vectors' dimension with all rows added.

    Raises:
        ValueError: If the vectors have zero dimension (e.g. an empty list).
    """
    # FAISS only accepts C-contiguous float32 matrices; normalising here lets
    # callers pass plain lists or float64 arrays as well.
    matrix = np.ascontiguousarray(np.atleast_2d(embeddings), dtype=np.float32)
    if matrix.shape[1] == 0:
        raise ValueError("embeddings must have a non-zero dimension")

    faiss_index = faiss.IndexFlatL2(matrix.shape[1])
    faiss_index.add(matrix)
    return faiss_index

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## **Step 5: Use the FAISS index to retrieve relevant sections based on user input.**


In [6]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer  # Import SentenceTransformer

# Placeholder corpus for the demo; replace with the sections extracted from your own documents
extracted_texts = [
    "This is the first section.",
    "This is the second section about machine learning.",
    "This is the third section.",
]

# Instantiate the sentence-transformer model used for both indexing and querying
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Pair every section with its embedding; each section is encoded as a one-item batch
text_embeddings = [
    (passage, embedder.encode([passage]))
    for passage in extracted_texts
]

# Create and populate FAISS index
def create_faiss_index(embedding_vectors):
    """Build a flat L2 FAISS index from section embeddings.

    Args:
        embedding_vectors: Either a list of ``(section, embedding)`` tuples,
            where each embedding has shape ``(dimension,)`` or
            ``(1, dimension)``, or a NumPy array of shape
            ``(n_vectors, dimension)`` — the form accepted by the Step 4
            definition of this function, which this one replaces once the
            cell has run.

    Returns:
        faiss.IndexFlatL2: Index holding the vectors, added in input order.

    Raises:
        ValueError: If ``embedding_vectors`` contains no embeddings.
    """
    if isinstance(embedding_vectors, np.ndarray):
        rows = [np.atleast_2d(embedding_vectors)]
    else:
        # Only the embedding half of each pair is indexed.
        rows = [np.atleast_2d(vector) for _section, vector in embedding_vectors]
    if not rows:
        raise ValueError("embedding_vectors must contain at least one embedding")

    # Stack once and add in a single call; FAISS needs C-contiguous float32.
    matrix = np.ascontiguousarray(np.vstack(rows), dtype=np.float32)
    faiss_index = faiss.IndexFlatL2(matrix.shape[1])
    faiss_index.add(matrix)
    return faiss_index

# Retrieve the top-N relevant sections based on query
def retrieve_relevant_sections(faiss_index, query, all_sections, top_k=3):
    """Return the sections whose embeddings are nearest to the query.

    Args:
        faiss_index: Index to search. Label ``i`` returned by the search is
            used to look up ``all_sections[i]``.
        query: Query text; embedded with the module-level ``embedder``.
        all_sections: Sequence of section texts.
        top_k: Maximum number of sections to return.

    Returns:
        list: Up to ``top_k`` sections, in the order the search returned them.
        Empty when ``all_sections`` is empty or ``top_k`` is not positive.
    """
    # Never ask for more neighbours than there are sections to return.
    k = min(top_k, len(all_sections))
    if k <= 0:
        return []

    query_embedding = embedder.encode([query])
    _distances, indices = faiss_index.search(query_embedding, k)

    # FAISS pads the labels with -1 when it finds fewer than k neighbours;
    # indexing with -1 would silently return the last section as a false hit.
    return [
        all_sections[idx]
        for idx in indices[0]
        if 0 <= idx < len(all_sections)
    ]

# Example: index the demo sections, then look up the ones closest to a query
faiss_index = create_faiss_index(text_embeddings)
query = "Explain machine learning"
relevant_sections = retrieve_relevant_sections(
    faiss_index=faiss_index,
    query=query,
    all_sections=extracted_texts,
)
print(relevant_sections)



['This is the second section about machine learning.', 'This is the first section.', 'This is the third section.']


## **Step 6: Use the HuggingFace model to generate questions based on retrieved text.**


In [7]:
from transformers import pipeline

# Question-generation pipeline from Hugging Face (same task and model as the Step 3 cell)
question_generator = pipeline(
    task="text2text-generation",
    model="valhalla/t5-base-qg-hl",
)

# Function to generate questions based on relevant sections
def generate_questions_from_sections(sections):
    """Generate one question string per section.

    Each section is prefixed with ``"generate question: "`` and passed to
    ``question_generator``; the ``'generated_text'`` of the first result is
    kept. The returned list follows the order of ``sections``.
    """
    return [
        question_generator(f"generate question: {passage}")[0]['generated_text']
        for passage in sections
    ]

# Example: turn the sections retrieved in Step 5 into questions
generated_questions = generate_questions_from_sections(sections=relevant_sections)

# Print every question with a 1-based label
for i, question in enumerate(generated_questions, start=1):
    label = f"Question {i}: {question}"
    print(label)




Question 1: What is the second section about machine learning?
Question 2: What is the first section of the book?
Question 3: What is the third section of the book?


## **Step 7: Create the Frontend Using Streamlit**

In [8]:
import streamlit as st

# NOTE: when this was executed as a plain notebook cell, the recorded output
# only shows a hint to launch the script with `streamlit run`; the page itself
# is expected to render only when started that way.
MAX_INPUT_CHARS = 512  # Characters of each PDF passed to the question generator

st.title("RAG-based Question Generator")

# Restrict the picker to PDFs so other file types never reach the PDF parser.
uploaded_files = st.file_uploader("Upload PDFs", type=["pdf"], accept_multiple_files=True)

if uploaded_files:
    for file in uploaded_files:
        try:
            text = extract_text_from_pdf(file)
        except Exception as exc:
            # Report the failing upload and keep processing the remaining ones.
            st.error(f"Could not read {file.name}: {exc}")
            continue

        if not text.strip():
            # Nothing to generate questions from; say so instead of calling the model.
            st.warning(f"No extractable text found in {file.name}.")
            continue

        st.write("Generated Questions:")
        questions = generate_questions(text[:MAX_INPUT_CHARS])  # Truncate to avoid length issue
        for question in questions:
            st.write(question['generated_text'])

2024-10-22 05:47:32.893 
  command:

    streamlit run /usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
