In [None]:
# Install the third-party packages this notebook imports below:
# pdfplumber (PDF text extraction), transformers (T5 model/tokenizer)
# and sentence-transformers (embedding model).
# NOTE(review): versions are not pinned, so a fresh run may pull newer releases.
!pip install pdfplumber
!pip install transformers
!pip install sentence-transformers

Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.4-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.2/59.2 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import io
import re

import numpy as np
import pdfplumber
import torch
from google.colab import files
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load T5 model and tokenizer.
# Both are created from the same `model_name` checkpoint; get_t5_answer uses
# `qa_tokenizer` to encode the prompt and decode the output of `qa_model`.
model_name = 't5-small'  # A small, efficient variant of T5
qa_model = T5ForConditionalGeneration.from_pretrained(model_name)
qa_tokenizer = T5Tokenizer.from_pretrained(model_name)

# Load Sentence Transformer model for embedding-based retrieval.
# find_relevant_chunk calls `embedder.encode` on the query and on the chunks.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Upload PDF file.
# NOTE(review): files.upload() presumably blocks until a file is picked in the
# Colab widget and returns a mapping of file name -> file contents; confirm
# against the google.colab documentation.
uploaded = files.upload()

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF as a single string.

    Args:
        pdf_path: Passed unchanged to ``pdfplumber.open``.

    Returns:
        The non-empty page texts joined with a newline. This function never
        raises: on any failure (the PDF cannot be opened, no text could be
        extracted, ...) it returns a message that starts with
        "Error occured while extracting text from the document" followed by
        the underlying error text. Callers detect failure via that prefix, so
        the message is kept exactly as it was.
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # extract_text() can yield None/"" for a page without text;
            # such pages are skipped so they contribute nothing, as before.
            page_texts = [
                page_text
                for page_text in (page.extract_text() for page in pdf.pages)
                if page_text
            ]

        # Join pages with a newline. Plain concatenation glued the last line
        # of one page to the first line of the next, which corrupts every
        # later split('\n') (chunking, title detection).
        text = "\n".join(page_texts)

        if not text.strip():
            raise ValueError("Not found.")
        return text
    except Exception as e:
        return f"Error occured while extracting text from the document : {str(e)}"

# Function to extract title from document
def extract_title(pdf_text):
    """Guess the document title from its extracted text.

    The first non-blank line among the first five lines wins. Only when all
    of those lines are blank is the text searched (case-insensitively) for
    the word "title"; whatever follows it on that line is returned.

    Args:
        pdf_text: Full text of the document.

    Returns:
        The stripped title, "Title not found." when nothing matched, or a
        message starting with "Error extracting title:" if anything raised.
    """
    try:
        leading_lines = pdf_text.split('\n')[:5]
        stripped_candidates = (candidate.strip() for candidate in leading_lines)
        first_nonblank = next((c for c in stripped_candidates if c), None)
        if first_nonblank is not None:
            return first_nonblank

        # None of the leading lines had content: look for a "Title" label.
        labelled = re.search(r"(?i)(?:title[:\s]*)(.*)", pdf_text)
        if labelled is None:
            return "Title not found."
        return labelled.group(1).strip()
    except Exception as e:
        return f"Error extracting title: {str(e)}"

# Function to chunk the document text into smaller pieces for efficient searching
def extract_chunks(pdf_text, chunk_size=500):
    """Group the lines of a document into word-bounded chunks.

    The text is split on newlines; blank lines are dropped and the remaining
    lines are stripped and accumulated. As soon as the accumulated lines hold
    more than ``chunk_size`` words, they are emitted as one space-joined chunk
    and accumulation restarts. A chunk can therefore exceed ``chunk_size`` by
    up to the word count of its last line.

    Args:
        pdf_text: Full text of the document.
        chunk_size: Word count a chunk must exceed before it is emitted.

    Returns:
        List of chunk strings in document order; empty if the text has no
        non-blank line.
    """
    chunks = []
    current_lines = []
    # Running word total for current_lines. Re-joining and re-splitting the
    # whole chunk for every line made the loop quadratic in the chunk length.
    current_word_count = 0

    for raw_line in pdf_text.split('\n'):
        line = raw_line.strip()
        if not line:
            # Blank lines add no words. Skipping the threshold check here also
            # prevents emitting an empty chunk when chunk_size is negative.
            continue

        current_lines.append(line)
        current_word_count += len(line.split())

        if current_word_count > chunk_size:
            chunks.append(" ".join(current_lines))
            current_lines = []
            current_word_count = 0

    if current_lines:
        chunks.append(" ".join(current_lines))

    return chunks

# Function to get the answer from the T5 model based on the relevant chunk
def get_t5_answer(query, context):
    """Ask the T5 model to answer ``query`` using ``context``.

    The prompt has the form "question: <query> context: <context>". It is
    tokenized with truncation at 512 tokens, and generation is capped at a
    max_length of 100.

    Args:
        query: The user's question.
        context: Text the answer should be drawn from.

    Returns:
        The decoded answer with special tokens removed, or a message starting
        with "Error generating response:" if any step raised.
    """
    try:
        prompt = f"question: {query} context: {context}"
        encoded = qa_tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=512,
        )

        generated = qa_model.generate(
            input_ids=encoded['input_ids'],
            attention_mask=encoded['attention_mask'],
            max_length=100,
        )

        # Only the first generated sequence is decoded.
        return qa_tokenizer.decode(generated[0], skip_special_tokens=True)
    except Exception as e:
        return f"Error generating response: {str(e)}"

# Function to extract references from the document
def extract_references(pdf_text):
    """Return the reference section of the document, if one can be located.

    Lookup order:
      1. The last line that consists only of a heading word ("References",
         "Citations" or "Bibliography", any letter case, optionally preceded
         by a section number and/or followed by a colon). Everything from the
         heading word to the end of the text is returned.
      2. If no such heading line exists, the first case-sensitive occurrence
         of one of those words anywhere in the text, through to the end.

    Step 1 exists because step 2 alone latches onto the first in-prose
    mention (e.g. "see References below") and returns most of the document.

    Args:
        pdf_text: Full text of the document.

    Returns:
        The section text starting at the heading word, "References not found."
        when nothing matched, or a message starting with
        "Error extracting references:" if anything raised.
    """
    try:
        heading_pattern = (
            r"^[ \t]*(?:\d+(?:\.\d+)*\.?[ \t]+)?"
            r"(References|Citations|Bibliography)[ \t]*:?[ \t\r]*$"
        )
        heading_matches = list(
            re.finditer(heading_pattern, pdf_text, re.MULTILINE | re.IGNORECASE)
        )
        if heading_matches:
            # Take the last heading line found in the text.
            return pdf_text[heading_matches[-1].start(1):]

        # Fallback: first occurrence of the keyword anywhere in the text.
        references_section = re.search(r"(References|Citations|Bibliography)[\s\S]*", pdf_text)
        if references_section:
            return references_section.group(0)
        return "References not found."
    except Exception as e:
        return f"Error extracting references: {str(e)}"

# Function to find the most relevant chunk for a given query using embeddings
def find_relevant_chunk(query, chunks, chunk_embeddings=None):
    """Return the chunk whose embedding is most similar to the query's.

    Args:
        query: The user's question.
        chunks: Non-empty sequence of text chunks to search.
        chunk_embeddings: Optional embeddings for ``chunks`` (one row per
            chunk, same order) as produced by ``embedder.encode(chunks)``.
            When omitted they are computed on this call; callers that answer
            many queries over the same chunks can compute them once and pass
            them in to avoid re-encoding the document for every question.

    Returns:
        The element of ``chunks`` with the highest cosine similarity to the
        query embedding.

    Raises:
        ValueError: If ``chunks`` is empty.
    """
    if len(chunks) == 0:
        # Fail with a clear message instead of an opaque error from the
        # embedding/similarity libraries on empty input.
        raise ValueError("chunks must contain at least one text chunk to search.")

    # Encode the query (as a one-element batch) and, if needed, the chunks.
    query_embedding = embedder.encode([query])
    if chunk_embeddings is None:
        chunk_embeddings = embedder.encode(chunks)

    # One row of similarities: query vs. every chunk.
    similarities = cosine_similarity(query_embedding, chunk_embeddings)

    # argmax over the flattened single-row matrix is the chunk index.
    best_chunk_index = int(np.argmax(similarities))
    return chunks[best_chunk_index]

# Main chatbot function
def chatbot(pdf_path):
    """Run an interactive question-answering loop over one PDF.

    The PDF text is extracted once, its title is printed, and the text is
    split into chunks. Then, for each line typed by the user:
      * "exit" (any case) ends the session;
      * an empty line prompts for a valid question;
      * a query mentioning references, citations or bibliography prints the
        result of ``extract_references``;
      * anything else is answered by ``get_t5_answer`` using the chunk picked
        by ``find_relevant_chunk``.
    End-of-input or an interrupt at the prompt also ends the session.

    Args:
        pdf_path: Forwarded to ``extract_text_from_pdf``.

    Returns:
        None. All results are printed.
    """
    pdf_text = extract_text_from_pdf(pdf_path)

    # extract_text_from_pdf signals failure by returning a message with this
    # exact prefix (spelling included). Matching the full prefix rather than
    # just "Error" keeps a document whose own text begins with the word
    # "Error" from being mistaken for a failure.
    extraction_error_prefix = "Error occured while extracting text from the document"
    if pdf_text.startswith(extraction_error_prefix):
        print(pdf_text)  # Report the extraction problem and stop
        return

    print(" Ask your questions now. Type 'exit' to quit.")

    # Display title when chatbot starts
    title = extract_title(pdf_text)
    print(f"Document Title: {title}")

    pdf_chunks = extract_chunks(pdf_text)

    # Same headings extract_references looks for in the document.
    reference_keywords = ("references", "citations", "bibliography")

    while True:
        try:
            query = input("User: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed input stream or interrupted prompt: leave cleanly
            # instead of surfacing a traceback.
            print("Exiting!")
            break

        if not query:
            print("Bot: Please enter a valid question.")
            continue

        lowered_query = query.lower()

        if lowered_query == 'exit':
            print("Exiting!")
            break

        if any(keyword in lowered_query for keyword in reference_keywords):
            references = extract_references(pdf_text)
            print(f"Bot: {references}")
            continue

        # Answer from the chunk most similar to the question.
        relevant_chunk = find_relevant_chunk(query, pdf_chunks)
        response = get_t5_answer(query, relevant_chunk)

        # get_t5_answer returns its own "Error ..." message on failure, so the
        # response is printed the same way in both cases.
        print(f"Bot: {response}")

# Run the chatbot on the PDF uploaded above.
# A hard-coded file name is unreliable here: when a file with the same name
# already exists, the upload is saved under a different name (the captured
# output shows "AI assignment (1).pdf" saved as "AI assignment (1) (2).pdf"),
# so a fixed path can silently read an older copy. The uploaded bytes are
# passed to the chatbot directly instead; the value is forwarded unchanged to
# pdfplumber.open by extract_text_from_pdf.
# NOTE(review): relies on files.upload() returning a mapping of
# file name -> file contents (bytes) — confirm against the Colab docs.
if not uploaded:
    raise ValueError("No file was uploaded; re-run the upload step and choose a PDF.")

# pdf_path keeps the name reported by the upload, for reference only.
pdf_path, pdf_bytes = next(iter(uploaded.items()))
chatbot(io.BytesIO(pdf_bytes))

Saving AI assignment (1).pdf to AI assignment (1) (2).pdf
 Ask your questions now. Type 'exit' to quit.
Document Title: Government College of Technology
Bot: BFS in highly randomized or large mazes
