#  Install Dependencies

In [1]:
!pip install pdfplumber faiss-cpu sentence-transformers
!pip install --quiet transformers


Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m41.0/42.0 kB[0m [31m54.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m635.3 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Downloading 

# **Import Libraries**

In [2]:
import pdfplumber
import re
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np


  from tqdm.autonotebook import tqdm, trange


# **Text Extraction from PDF**

In [3]:
from google.colab import files

# Upload PDF
uploaded = files.upload()

# Extract text from PDF
def extract_pdf_text(file_path):
    with pdfplumber.open(file_path) as pdf:
        text = ''
        for page in pdf.pages:
            text += page.extract_text()
    return text

# Use the uploaded file name
pdf_file = list(uploaded.keys())[0]
pdf_text = extract_pdf_text(pdf_file)
print(f"Extracted {len(pdf_text)} characters.")


Saving input.pdf to input.pdf
Extracted 95484 characters.


# **Preprocess Extracted Text**

In [4]:
import spacy

# Preprocess text
def preprocess_text(text):
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces/newlines
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents]
    return sentences

# Tokenize text into sentences
sentences = preprocess_text(pdf_text)
print(f"Processed text into {len(sentences)} sentences.")


Processed text into 767 sentences.


# **Generate Sentence Embeddings**

In [5]:
# Load SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for each sentence
embeddings = model.encode(sentences, convert_to_tensor=True)
print(f"Generated embeddings for {len(sentences)} sentences.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Generated embeddings for 767 sentences.


# **Build a FAISS Search Index**

In [6]:
# Convert embeddings to numpy array
embeddings_np = np.array([emb.numpy() for emb in embeddings])

# Create FAISS index
index = faiss.IndexFlatL2(embeddings_np.shape[1])
index.add(embeddings_np)
print(f"FAISS index created with {index.ntotal} entries.")


FAISS index created with 767 entries.


# **Query Function**

In [7]:
def search_query(query, sentences, index, model, threshold=0.5):
    """Search for the closest sentence to the query."""
    query_embedding = model.encode([query], convert_to_tensor=True).numpy()
    distances, indices = index.search(query_embedding, k=1)
    if distances[0][0] > threshold:
        return None  # Fallback if no close match
    return sentences[indices[0][0]]


# **Interactive Chatbot**

In [9]:
# Interactive Chatbot
while True:
    query = input("Ask a question (or type 'exit' to quit): ")
    if query.lower() == 'exit':
        print("Exiting the chatbot. Goodbye!")
        break
    response = search_query(query, sentences, index, model)
    if response:
        print(f"Answer: {response}")
    else:
        print("Sorry, I didn’t understand your question. Do you want to connect with a live agent?")


Ask a question (or type 'exit' to quit):  Term End Examination 
Answer: Term End Examina;on Eligibility & Policies • To be eligible for the Term End Examina9on, students are expected to complete the academic cycle of the Semester enrolled for.
Ask a question (or type 'exit' to quit): Project Submission 
Sorry, I didn’t understand your question. Do you want to connect with a live agent?
Ask a question (or type 'exit' to quit): exit
Exiting the chatbot. Goodbye!
