In [2]:
# necessary libraries
import numpy as np
import faiss
from transformers import pipeline, AutoTokenizer, AutoModel
from PyPDF2 import PdfReader


In [23]:
# Text extraction from a PDF file
def extract_text_from_pdf(file_path):
    reader = PdfReader(file_path)
    text = "".join([page.extract_text() for page in reader.pages])
    return text


file_path = "D:\GEN AI\Document-Search-Retrieval-System\Syllabus2019(IT) (2).pdf" 
extracted_text = extract_text_from_pdf(file_path)
print("Extracted Text:", extracted_text[:10000])  


Extracted Text:  
 
DEPARTMENT OF INFORMATION TECHNOLOGY  
SYLLABUS OF 2ND TO 4TH YEAR OF THE  
UNDERGRADUATE ENGINEERING DEGREE PROGRAMME  
 
(IT/PC/B/T/211) Data Structures & Algorithms  
 
Introduction: Algorith ms, Order Notation: Time and Space Analysis of Algorithms  
 
Sequential Representations of lists:  Arrays and Lists, Linked Representation - Linear linked lists. 
Circular linked lists. Doubly linked lists. Operations on all types of lists. Applications.  
 
Special Lists:  Stacks, Queues and their applications.  
 
Recursion:  Design of recursive algorithms, Recursion vs. Iteration, Removal of Recursion  
 
Trees  - Binary Trees, Traversals of binary trees, Structural properties of binary trees. Representation of 
binary  trees in terms of pointers and arrays. General trees   
 
Binary Search Trees:  Search, Insertion and Deletion algorithms, Structural properties.  Threaded Binary 
trees.  
 
Balanced Binary Search Trees:  AVL tree, B -trees, B+ - trees.  
 
Graphs:  Repr

In [32]:
# Load pre-trained sentence transformer model
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

# Function to generate embeddings
def generate_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    outputs = model(**inputs)
    embedding = outputs.last_hidden_state.mean(dim=1).detach().numpy()
    return embedding

# Generate embedding for the extracted text
embedding = generate_embedding(extracted_text)
print("Generated Embedding Shape:", embedding.shape)


Generated Embedding Shape: (1, 384)


In [34]:
# Initialize FAISS index
dimension = 384 # Embedding size for MiniLM model
index = faiss.IndexFlatL2(dimension)

# Add the embedding to the FAISS index
index.add(np.array(embedding, dtype=np.float32))

# Check the number of indexed vectors
print("Number of indexed vectors:", index.ntotal)


Number of indexed vectors: 1


In [40]:
# Query for similarity search
query = "Spanning Tree Algorithms ?"
query_embedding = generate_embedding(query)

# Search FAISS for similar documents
D, I = index.search(np.array(query_embedding, dtype=np.float32), k=6)
print("Indices of top 5 results:", I)


Indices of top 5 results: [[ 0 -1 -1 -1 -1 -1]]


In [41]:
# Load QA model pipeline
qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

# Example QA based on the retrieved text
context = extracted_text  # In a real case, you'd use the most relevant passages.
result = qa_pipeline(question=query, context=context)
print("Answer:", result['answer'])


Answer: Minimal 
Spanning Tree Algorithms  
 
Sorting
