**The objective of this project is to develop an app where a user can provide a PDF and perform the following operations in their preferred language:**


**1. Generate an overview of the PDF in the form of a course outline.**

**2. Generate questions from the PDF as MCQs, short-answer questions, and fill-in-the-blank questions.**

**3. Chat with the PDF.**

# **1. Installing Packages**

In [None]:
# Third-party dependencies used by this notebook; the leading `!` runs each line as a shell command.
!pip install PyPDF2
!pip install faiss-cpu
!pip install transformers
!pip install sentence_transformers
!pip install google-generativeai

# **2. VidyaVistaar**

In [3]:
# Importing Libraries
import faiss
import time
import PyPDF2
import requests
import numpy as np
import pandas as pd
import multiprocessing
from io import BytesIO
from google.colab import userdata
import google.generativeai as genai
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
import warnings
warnings.filterwarnings("ignore")


# Configure the Embedding Model and FAISS for Vector-based Retrieval
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
# Size the index from the model rather than hard-coding 384, so that changing the
# model name above cannot leave the index with a mismatched vector dimension.
index = faiss.IndexFlatL2(embedding_model.get_sentence_embedding_dimension())
# Maps a FAISS vector id to {"chunk": <chunk text>, "doc_id": <document label>}.
metadata = {}


# Configure the Google Generative AI API
# The key is read from the Colab secret named 'GOOGLE_API_KEY' via google.colab.userdata,
# so this presumably only works when run inside Colab — TODO confirm for other environments.
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)
# Shared model handle used by the generation and translation helpers defined below.
model = genai.GenerativeModel('gemini-pro')


# Extract texts from the pdf
def extract_text_from_pdf(pdf_path):
    """Read the PDF at `pdf_path` and return the text of all its pages.

    Page texts are concatenated in page order with no separator between them.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The concatenated text of every page (possibly an empty string).
    """
    with open(pdf_path, "rb") as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # Extraction must happen while the file is still open, hence inside `with`.
        # `or ""` is defensive: it keeps the join from failing should a page yield
        # no text object (assumption about the installed PyPDF2 version — confirm).
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)


# Splits text into sentence-aware chunks with overlap.
def split_text_into_chunks(text, max_chunk_size=2000, overlap=50):
    """Split `text` into chunks built from whole sentences, with character overlap.

    Sentences are delimited by ". ". Sentences are accumulated (re-joined with
    ". ") until adding the next one would push the chunk past `max_chunk_size`
    characters; the chunk is then emitted and the next chunk starts with the
    last `overlap` characters of the emitted one followed by the new sentence.

    Args:
        text: The text to split.
        max_chunk_size: Target maximum chunk length in characters. A single
            sentence longer than this is kept whole, and the overlap prefix is
            not counted, so a chunk can still exceed the limit.
        overlap: Number of trailing characters of the previous chunk repeated
            at the start of the next one. Zero or a negative value disables it.

    Returns:
        A list of non-empty chunk strings; an empty list for empty text.
    """
    separator = ". "
    chunks = []
    current_chunk = ""

    for sentence in text.split(separator):
        if not current_chunk:
            # Nothing accumulated yet: start the chunk with this sentence even if
            # it is oversized, instead of emitting an empty chunk first.
            current_chunk = sentence
        elif len(current_chunk) + len(separator) + len(sentence) > max_chunk_size:
            chunks.append(current_chunk)
            # Slicing with [-0:] would return the whole string, so handle
            # "no overlap" explicitly.
            tail = current_chunk[-overlap:] if overlap > 0 else ""
            # The tail ends where a sentence ended, so restore the separator that
            # split() removed rather than gluing the two pieces together.
            current_chunk = f"{tail}{separator}{sentence}" if tail else sentence
        else:
            current_chunk += separator + sentence

    if current_chunk:
        chunks.append(current_chunk)

    return chunks


# Generates embeddings for text chunks and stores them in FAISS.
def generate_and_store_embeddings(text, doc_id="doc1"):
    """Chunk `text`, embed every chunk, and register both in the vector store.

    The embeddings are added to the module-level FAISS `index`, and each chunk's
    text is recorded in the module-level `metadata` dict.

    Args:
        text: Full document text to index.
        doc_id: Label stored alongside every chunk of this document.

    Raises:
        ValueError: If `text` yields no chunks (for example, an empty document).
    """
    chunks = split_text_into_chunks(text)
    if not chunks:
        # Fail early with a clear message rather than handing an empty batch to
        # the encoder and the index.
        raise ValueError("No text available to embed; the document appears to be empty.")

    embeddings = np.array(embedding_model.encode(chunks)).astype(np.float32)

    # retrieve_chunks looks metadata up by the ids that index.search returns, so
    # key each chunk by its position in the index (the index size before this
    # add) instead of len(metadata), which only matches while the two never drift.
    first_id = index.ntotal
    index.add(embeddings)  # Add embeddings to FAISS index

    # Store metadata for each embedding
    for offset, chunk in enumerate(chunks):
        metadata[first_id + offset] = {"chunk": chunk, "doc_id": doc_id}


# Retrieves the top-k most relevant chunks for a query.
def retrieve_chunks(query, top_k=5):
    """Return the stored chunks whose embeddings FAISS ranks closest to `query`.

    The query is embedded with the module-level `embedding_model`, searched
    against the module-level `index`, and each hit is mapped back to its text
    through `metadata`.

    Args:
        query: Text to search for.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of {"chunk": ..., "doc_id": ...} dicts in the order the index
        returned them. An empty list is returned when there are no hits or when
        any step fails; in both cases a message is printed instead of raising.
    """
    try:
        query_embedding = np.array(embedding_model.encode([query])).astype(np.float32)
        _distances, indices = index.search(query_embedding, top_k)

        # Slots the index could not fill are reported as -1; drop them.
        hit_ids = [int(idx) for idx in indices[0] if idx != -1]
        if not hit_ids:
            print("Error during chunk retrieval: No relevant chunks found for the query.")
            return []

        # Fetch the corresponding chunks from metadata
        return [
            {"chunk": metadata[hit_id]["chunk"], "doc_id": metadata[hit_id]["doc_id"]}
            for hit_id in hit_ids
        ]
    except Exception as e:
        # Deliberate best-effort boundary: callers treat an empty list as
        # "nothing found", so report the problem and carry on.
        print(f"Error during chunk retrieval: {e}")
        return []


# Generate content (course outline or questions) with RAG
def generate_rescontent(query, retrieved_chunks, prompt):
    """
    Generate content from a task prompt combined with retrieved document chunks.

    The text of every retrieved chunk is joined with single spaces, appended to
    `prompt`, and the combined request is sent to the shared generative `model`.

    Args:
        query: Currently unused by this function.
        retrieved_chunks: List of dicts, each with a "chunk" text entry.
        prompt: Task instructions placed ahead of the retrieved context.

    Returns:
        The model's reply text with surrounding whitespace removed, or a fixed
        apology message when `retrieved_chunks` is empty.
    """
    if retrieved_chunks:
        # Build the context from the retrieved chunk texts.
        context_text = " ".join(entry["chunk"] for entry in retrieved_chunks)
        request_text = "{}\n\n{}\n\nGenerated Content:".format(prompt, context_text)

        # Ask the generative model for the content.
        reply = model.generate_content(request_text)
        return reply.text.strip()

    return "Sorry, I couldn't find any relevant information for your request."


# Generates a response For Chatbot using Google Gemini.
def generate_response(query, retrieved_chunks):
    """Answer `query` with the shared generative `model`, using retrieved chunks as context.

    Args:
        query: The user's question.
        retrieved_chunks: List of dicts, each with a "chunk" text entry.

    Returns:
        The model's reply text with surrounding whitespace removed, or a fixed
        apology message when `retrieved_chunks` is empty.
    """
    if retrieved_chunks:
        context_text = " ".join(entry["chunk"] for entry in retrieved_chunks)
        question_prompt = "Context: {}\n\nQuestion: {}\n\nAnswer:".format(context_text, query)
        reply = model.generate_content(question_prompt)
        return reply.text.strip()

    return "Sorry, I couldn't find any relevant information for your question."


# Translate the response before giving the Output.
def translate_text(text, language_name):
    """Translate `text` into `language_name` using the shared generative `model`.

    Args:
        text: Text to translate.
        language_name: Target language name, inserted into the prompt as given.

    Returns:
        The model's reply with surrounding whitespace removed, or "" when `text`
        is empty or whitespace-only (no request is sent in that case).

    Raises:
        ValueError: If the model's response contains no parts.
    """
    # Nothing to translate: skip the model call instead of prompting it with an
    # empty payload.
    if not text or not text.strip():
        return ""

    prompt = f"Translate the following text to {language_name}: {text}"
    response = model.generate_content(prompt)
    if not response.parts:
        raise ValueError("No valid parts in response.")
    return response.text.strip()


# Various Prompts for Various Tasks.
def create_prompts(text, task_type):
    """Build the instruction prompt for `task_type` with `text` embedded in it.

    Args:
        text: Source text inserted into the prompt's "Text:" section.
        task_type: One of "mcq", "fill_in_the_blank", "short_answer", "course".

    Returns:
        The filled-in prompt string, or "" when `task_type` is not recognised.
    """
    templates = {
        "mcq": (
            "Read the following text carefully and generate multiple-choice questions. Each question should include:\n"
            "1. A clear and concise question based on the text.\n"
            "2. Give a question with Four options (A, B, C, D), with one correct answer clearly indicated.\n"
            "3. The questions should cover key concepts, definitions, critical points, and significant details discussed in the text.\n"
            "4. Ensure the options are plausible and relevant to the content.\n\n"
            "Text:\n{text}\n\nMCQ:"
        ),
        "fill_in_the_blank": (
            "Read the following text thoroughly and generate fill-in-the-blank questions. Each question should include:\n"
            "1. A sentence from the text with one key term or concept replaced by a blank.\n"
            "2. The correct term or concept that completes the sentence accurately.\n"
            "3. Focus on important information, such as key terms, dates, names, and concepts that are critical to understanding the text.\n\n"
            "Text:\n{text}\n\nFill in the blank:"
        ),
        "short_answer": (
            "Read the following text attentively and generate short answer questions. Each question should include:\n"
            "1. A clear and specific question that requires a brief response.\n"
            "2. The response should address key points, explanations, or definitions provided in the text.\n"
            "3. Ensure the questions encourage critical thinking and comprehension of the material, focusing on important details and concepts.\n\n"
            "Text:\n{text}\n\nShort answer question:"
        ),
        "course": (
            "Read the following text and generate a comprehensive, structured curriculum content. The content should include:\n"
            "1. Learning objectives and outcomes.\n"
            "2. Topic-wise breakdown with detailed descriptions.\n"
            "3. Key concepts, definitions, and explanations.\n"
            "4. Examples, illustrations, and case studies.\n"
            "5. Assessment and evaluation criteria.\n\n"
            "Text:\n{text}\n\nCurriculum Content:"
        ),
    }

    template = templates.get(task_type)
    if template is None:
        return ""
    # Only the {text} placeholder is substituted; braces inside `text` itself are
    # inserted verbatim because they arrive as an argument, not as template.
    return template.format(text=text)


# Get User Input from User.
def get_user_input(prompt):
    """Display `prompt` and return the line of text the user enters."""
    reply = input(prompt)
    return reply


# Chatbot Functionality.
# Languages the user may choose for the translated output.
_SUPPORTED_LANGUAGES = [
    "Arabic", "Czech", "German", "English", "Spanish", "Estonian", "Finnish", "French", "Gujarati",
    "Hindi", "Italian", "Japanese", "Kazakh", "Korean", "Lithuanian", "Latvian", "Burmese", "Nepali",
    "Dutch", "Romanian", "Russian", "Sinhala", "Turkish", "Vietnamese", "Chinese", "Afrikaans",
    "Azerbaijani", "Bengali", "Persian", "Hebrew", "Croatian", "Indonesian", "Georgian", "Khmer",
    "Macedonian", "Malayalam", "Mongolian", "Marathi", "Polish", "Pashto", "Portuguese", "Swedish",
    "Swahili", "Tamil", "Telugu", "Thai", "Tagalog", "Ukrainian", "Urdu", "Xhosa", "Galician",
    "Slovene"
]
# Question types and batch sizes accepted by the question generator.
_QUESTION_TYPES = ('mcq', 'fill_in_the_blank', 'short_answer')
_QUESTION_COUNTS = (5, 10, 15)


def _choose_language():
    """List the supported languages and prompt until the user picks one of them."""
    print("Available languages:")
    for language in _SUPPORTED_LANGUAGES:
        print(language)

    # Match case-insensitively (and ignore stray spaces) so "korean" is accepted,
    # but always return the canonical spelling from the list.
    canonical_by_key = {name.casefold(): name for name in _SUPPORTED_LANGUAGES}
    while True:
        language_choice = get_user_input("Choose a language for the output: ")
        language_name = canonical_by_key.get(language_choice.strip().casefold())
        if language_name is not None:
            return language_name
        print("Invalid choice. Please choose a valid language.")


def _run_course_generation(pdf_text, language_name):
    """Menu option 1: generate a course outline and print it in `language_name`."""
    print("Generating the course using Retrieval-Augmented Generation (RAG)...")
    # NOTE(review): this prompt embeds the full PDF text, and generate_rescontent
    # then appends the retrieved chunks after it — confirm this is intended.
    prompt = create_prompts(pdf_text, "course")
    retrieved_chunks = retrieve_chunks("Generate a comprehensive course outline based on the following content.", top_k=5)
    course_content = generate_rescontent("Generate a course based on the content.", retrieved_chunks, prompt)
    translated_course_content = translate_text(course_content, language_name)
    print("Generated Course Content:")
    print(translated_course_content)


def _run_question_generation(language_name):
    """Menu option 2: generate batches of questions until the user stops."""
    previous_question_type = None
    while True:
        question_type = get_user_input("Choose the type of questions to generate (mcq, fill_in_the_blank, short_answer): ").lower()
        if question_type not in _QUESTION_TYPES:
            print("Invalid choice. Please choose either 'mcq', 'fill_in_the_blank', or 'short_answer'.")
            continue
        if question_type == previous_question_type:
            print(f"You've already generated {question_type} questions. Please choose a different type.")
            continue

        raw_count = get_user_input("Enter the number of questions to generate (5, 10, 15): ")
        try:
            num_questions = int(raw_count)
        except ValueError:
            # Non-numeric input is reported as an invalid count below instead of
            # aborting the whole session.
            num_questions = None
        if num_questions not in _QUESTION_COUNTS:
            print("Invalid number of questions. Please choose either 5, 10, or 15.")
            continue

        print(f"Generating {question_type.upper()} questions using Retrieval-Augmented Generation (RAG)...")
        retrieved_chunks = retrieve_chunks(f"Generate {question_type} questions based on the content.", top_k=5)
        # NOTE(review): the retrieved text is embedded in this prompt and then
        # appended again inside generate_rescontent — confirm this is intended.
        prompt = create_prompts(" ".join(chunk["chunk"] for chunk in retrieved_chunks), question_type)

        # One model call per requested question, all using the same prompt.
        questions = []
        for _ in range(num_questions):
            question = generate_rescontent(f"Generate a {question_type} question based on the context.", retrieved_chunks, prompt)
            questions.append(translate_text(question, language_name))

        print(f"Questions in questions array :- {len(questions)}")
        print(f"\n{question_type.upper()} Questions:")
        for idx, question in enumerate(questions, 1):
            print(f"{idx}. {question}")
            print()

        another_round = get_user_input("Do you want to generate a different type of questions? (yes/no): ").lower()
        if another_round != 'yes':
            break
        previous_question_type = question_type


def _run_pdf_chat(language_name):
    """Menu option 3: answer questions about the PDF until the user stops."""
    while True:
        user_query = get_user_input("Ask a question about the content of the PDF: ")
        retrieved_chunks = retrieve_chunks(user_query, top_k=5)
        response = generate_response(user_query, retrieved_chunks)
        translated_answer = translate_text(response, language_name)
        print(f"Answer: {translated_answer}")

        another_question = get_user_input("Do you want to ask another question? (yes/no): ").lower()
        if another_question != 'yes':
            break


def chatbot():
    """Run the interactive console session.

    Asks for a PDF path, indexes the document, asks for an output language, and
    then loops over the main menu (generate course, generate questions, chat
    with the PDF, exit). Returns early, after printing the error, if the PDF
    cannot be loaded or indexed.
    """
    print("Welcome to the AnantaLearn Chatbot!!!!")
    pdf_path = get_user_input("Enter the path to your PDF file: ")
    try:
        pdf_text = extract_text_from_pdf(pdf_path)
        generate_and_store_embeddings(pdf_text, doc_id="pdf_doc")
        print("PDF loaded and processed successfully. You can now ask questions related to the document.")
    except Exception as e:
        # Top-level boundary: any loading/indexing failure ends the session politely.
        print(f"Error loading PDF: {e}")
        return

    language_name = _choose_language()

    while True:
        main_choice = get_user_input("Choose an option: (1) Generate Course, (2) Generate Questions, (3) Chat with PDF, (4) Exit: ").lower()
        if main_choice == '1':    # Option 1: Generate Course
            _run_course_generation(pdf_text, language_name)
        elif main_choice == '2':   # Option 2: Generate Questions
            _run_question_generation(language_name)
        elif main_choice == '3':  # Option 3: Chat with PDF
            _run_pdf_chat(language_name)
        elif main_choice == '4':  # This exits the main loop.
            print("Exiting the program. Goodbye!")
            break
        else:
            print("Invalid choice. Please choose either '1', '2', '3', or '4'.")

# Start the chatbot
# The guard runs the interactive session only when this code is executed as the
# main module, not when it is imported from elsewhere.
if __name__ == "__main__":
    chatbot()


Welcome to the AnantaLearn Chatbot!!!!
Enter the path to your PDF file: /content/DevOps Syllabus.pdf
PDF loaded and processed successfully. You can now ask questions related to the document.
Available languages:
Arabic
Czech
German
English
Spanish
Estonian
Finnish
French
Gujarati
Hindi
Italian
Japanese
Kazakh
Korean
Lithuanian
Latvian
Burmese
Nepali
Dutch
Romanian
Russian
Sinhala
Turkish
Vietnamese
Chinese
Afrikaans
Azerbaijani
Bengali
Persian
Hebrew
Croatian
Indonesian
Georgian
Khmer
Macedonian
Malayalam
Mongolian
Marathi
Polish
Pashto
Portuguese
Swedish
Swahili
Tamil
Telugu
Thai
Tagalog
Ukrainian
Urdu
Xhosa
Galician
Slovene
Choose a language for the output: Korean
Choose an option: (1) Generate Course, (2) Generate Questions, (3) Chat with PDF, (4) Exit: 1
Generating the course using Retrieval-Augmented Generation (RAG)...
Generated Course Content:
## DevOps 커리큘럼 내용

**학습 목표 및 결과:**

**목표:**

* DevOps 방법론과 SDLC에서의 적용 이해.
* Docker 컨테이너화와 그 중요성 탐구.

**결과:**

* **CO1:** DevOps 개념과 업무 실행