In [1]:
!pip install scikit-learn sentence-transformers faiss-cpu


Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


In [10]:
!pip install PyMuPDF sentence-transformers

import fitz  # PyMuPDF
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Upload the PDF through the Colab file picker; the result is keyed by file name.
from google.colab import files
uploaded = files.upload()
if not uploaded:
    # Without this guard a cancelled upload surfaces as a bare IndexError/StopIteration.
    raise ValueError("No file was uploaded; please select a PDF to search.")

# Extract text from the first uploaded PDF. Each non-blank page becomes one
# searchable document (pages are NOT merged), stripped of surrounding whitespace.
file_name = next(iter(uploaded))
documents = []
with fitz.open(file_name) as pdf:
    for page in pdf:
        text = page.get_text().strip()
        if text:  # skip pages that contain only whitespace
            documents.append(text)

# Initialize SentenceTransformer
semantic_model = SentenceTransformer('all-MiniLM-L6-v2')

# Function for Keyword-based Search
def keyword_search(query, documents):
    """Score every document against the query with TF-IDF cosine similarity.

    A new TF-IDF vectorizer (English stop words removed) is fitted on
    ``documents`` on every call, and the query is transformed with that same
    fitted vectorizer.

    Args:
        query: The search string.
        documents: Sequence of document strings to rank.

    Returns:
        A flattened array holding one cosine similarity per document, in the
        same order as ``documents``.
    """
    vectorizer = TfidfVectorizer(stop_words='english')
    document_vectors = vectorizer.fit_transform(documents)
    query_vector = vectorizer.transform([query])

    # cosine_similarity yields a 1 x n_documents matrix; flatten it to 1-D.
    return cosine_similarity(query_vector, document_vectors).flatten()

# Function for Semantic Search
def semantic_search(query, documents, semantic_model):
    """Score every document against the query using embedding cosine similarity.

    Args:
        query: The search string.
        documents: Sequence of document strings to rank.
        semantic_model: Object whose ``encode`` method turns a list of strings
            into embeddings.

    Returns:
        A flattened array holding one cosine similarity per document, in the
        same order as ``documents``.

    Note:
        A later cell in this notebook defines another ``semantic_search`` that
        returns ``(best_document, score)`` instead of the score array. Re-run
        this cell before calling ``hybrid_search`` if that cell has been run.
    """
    document_vectors = semantic_model.encode(documents)
    query_vector = semantic_model.encode([query])

    # cosine_similarity yields a 1 x n_documents matrix; flatten it to 1-D.
    return cosine_similarity(query_vector, document_vectors).flatten()

# Function to Combine Keyword and Semantic Search Scores
def hybrid_search(query, documents, semantic_model, keyword_weight=0.5):
    """Return the document that best matches the query under a blended score.

    Keyword (TF-IDF) and semantic (embedding) similarities are each min-max
    normalised to [0, 1] and combined as
    ``keyword_weight * keyword + (1 - keyword_weight) * semantic``.

    Args:
        query: The search string.
        documents: Non-empty sequence of document strings to rank.
        semantic_model: Embedding model forwarded to ``semantic_search``.
        keyword_weight: Share of the final score given to the keyword signal,
            between 0 and 1. The default of 0.5 weights both signals equally.

    Returns:
        A ``(best_document, best_score)`` tuple.

    Raises:
        ValueError: If ``documents`` is empty or ``keyword_weight`` is outside
            the range [0, 1].

    Note:
        This relies on ``semantic_search`` returning one score per document.
        A later cell in this notebook redefines ``semantic_search`` to return a
        ``(document, score)`` tuple; re-run the cell defining the array-returning
        version before calling this function if that later cell has been run.
    """
    if len(documents) == 0:
        raise ValueError("documents must contain at least one entry")
    if not 0.0 <= keyword_weight <= 1.0:
        raise ValueError("keyword_weight must be between 0 and 1")

    # Get the keyword and semantic scores, each rescaled to a common [0, 1] range
    keyword_scores = _min_max_normalize(keyword_search(query, documents))
    semantic_scores = _min_max_normalize(semantic_search(query, documents, semantic_model))

    # Combine scores: keyword_weight controls the importance of each part
    hybrid_scores = keyword_weight * keyword_scores + (1.0 - keyword_weight) * semantic_scores

    # Get the index of the document with the highest combined score
    best_match_idx = int(np.argmax(hybrid_scores))

    # Return the document with the highest hybrid score
    return documents[best_match_idx], hybrid_scores[best_match_idx]


def _min_max_normalize(scores):
    """Rescale scores linearly into [0, 1]; all-equal input maps to all zeros."""
    scores = np.asarray(scores, dtype=float)
    lowest = scores.min()
    spread = scores.max() - lowest
    if spread == 0:
        # All documents tie on this signal (e.g. a single document, or a query
        # sharing no keywords with any document). Dividing by the zero spread
        # would produce NaN and make argmax silently pick index 0.
        return np.zeros_like(scores)
    return (scores - lowest) / spread

# Demo: rank the uploaded pages for a sample question and report the top hybrid match
query = "What is PATH FOLLOWING LAB ASSISTANT"
best_document, score = hybrid_search(query, documents, semantic_model)

# One print call, one field per line (same text as three separate prints).
print(
    f"Query: {query}",
    f"Best Document: {best_document}",
    f"Combined Score: {score}",
    sep="\n",
)




Saving Research_paper (1).pdf to Research_paper (1) (4).pdf
Query: What is PATH FOLLOWING LAB ASSISTANT
Best Document: Copyright © 2023 The Author(s): This is an open-access article distributed under the terms of the Creative 
Commons Attribution 4.0 International License (CC BY-NC 4.0) which permits unrestricted use, distribution, and 
reproduction in any medium for non-commercial use provided the original author and source are credited. 
 
 
 
International Journal of Scientific Research in Computer Science, Engineering and 
Information Technology 
ISSN : 2456-3307 
 
Available Online at : www.ijsrcseit.com 
doi : https://doi.org/10.32628/CSEIT2390622 
 
 
 
 
 
 
 
168 
Learn Buddy : Path Following Lab Assistant Robot 
Jitendra Gaikwad, Raj Patil, Prathmesh Raut, Divyesh Thakur 
Instrumentation and Control Engineering, Vishwakarma Institute of Technology, Pune, Maharashtra, India 
 
A R T I C L E I N F O 
 
A B S T R A C T 
Article History: 
Accepted:  15 Nov 2023 
Published: 30 Nov

# Hybrid search is typically best for large document collections because it combines the speed of keyword (TF-IDF) matching with the depth of embedding-based matching, while purely semantic search excels at understanding complex language and context.


In [11]:
!pip install sentence-transformers PyPDF2

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import PyPDF2

# Initialize the SentenceTransformer model.
# 'all-MiniLM-L6-v2' is the same model name the earlier hybrid-search cell uses,
# so running this cell rebinds `semantic_model` to a newly constructed instance.
semantic_model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to extract text from PDF
def extract_text_from_pdf(file_path):
    """Extract the non-blank text lines of a PDF, in page order.

    Each page's text is split on newlines separately, so the last line of one
    page is never fused with the first line of the next.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A list of stripped, non-empty text lines.
    """
    lines = []
    with open(file_path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        for page in reader.pages:
            # Guard against a page yielding no text; presumably possible for
            # image-only pages (verify against the PyPDF2 version in use).
            page_text = page.extract_text() or ""
            for line in page_text.split('\n'):
                stripped = line.strip()
                if stripped:
                    lines.append(stripped)
    return lines

# Function for Semantic Search
def semantic_search(query, documents, semantic_model):
    """Return the document whose embedding is most similar to the query's.

    Args:
        query: The search string.
        documents: Sequence of document strings to rank.
        semantic_model: Object whose ``encode`` method turns a list of strings
            into embeddings.

    Returns:
        A ``(best_document, best_score)`` tuple, where ``best_score`` is the
        highest cosine similarity between the query and any document.

    Note:
        This shadows the ``semantic_search`` defined in the earlier
        hybrid-search cell, which returns the full score array instead.
    """
    document_vectors = semantic_model.encode(documents)
    query_vector = semantic_model.encode([query])

    # cosine_similarity yields a 1 x n_documents matrix; flatten it to 1-D.
    similarity_row = cosine_similarity(query_vector, document_vectors).flatten()

    winner = similarity_row.argmax()
    return documents[winner], similarity_row[winner]

# Upload the PDF file in Colab; the result is keyed by file name.
from google.colab import files
uploaded = files.upload()
if not uploaded:
    # Without this guard a cancelled upload surfaces as a bare IndexError/StopIteration.
    raise ValueError("No file was uploaded; please select a PDF to search.")

# Extract text from the first uploaded PDF (one document per text line)
file_name = next(iter(uploaded))
documents = extract_text_from_pdf(file_name)

# Example query
query = " What is PATH FOLLOWING LAB ASSISTANT"

# Perform semantic search
best_document, score = semantic_search(query, documents, semantic_model)

print(f"Query: {query}")
print(f"Best Document: {best_document}")
print(f"Semantic Score: {score}")




Saving Research_paper.pdf to Research_paper (6).pdf
Query:  What is PATH FOLLOWING LAB ASSISTANT
Best Document: Learn Buddy  : Path Following Lab Assistant Robot
Semantic Score: 0.7503138780593872
