In [None]:
!pip install fitz
!pip install pymupdf
!pip install transformers
!pip install torch
!pip install numpy
!pip install scikit-learn

Collecting fitz
  Downloading fitz-0.0.1.dev2-py2.py3-none-any.whl (20 kB)
Collecting configobj (from fitz)
  Downloading configobj-5.0.8-py2.py3-none-any.whl (36 kB)
Collecting configparser (from fitz)
  Downloading configparser-7.0.0-py3-none-any.whl (16 kB)
Collecting nipype (from fitz)
  Downloading nipype-1.8.6-py3-none-any.whl (3.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
Collecting pyxnat (from fitz)
  Downloading pyxnat-1.6.2-py3-none-any.whl (95 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.6/95.6 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
Collecting prov>=1.5.2 (from nipype->fitz)
  Downloading prov-2.0.1-py3-none-any.whl (421 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m421.5/421.5 kB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
Collecting rdflib>=5.0.0 (from nipype->fitz)
  Downloading rdflib-7.0.0-py3-none-any.whl (531 kB)
[2K     [90m━━━━━

In [None]:
import fitz  # PyMuPDF for extracting text from PDFs
from transformers import AutoModel, AutoTokenizer
import torch
import numpy as np

def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, in page order.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        A list with one string per page. Element ``i`` holds the text of
        page ``i + 1``.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction fails part-way through.
    with fitz.open(pdf_path) as document:
        return [page.get_text() for page in document]

def embed_text(text, model, tokenizer):
    """Embed text as the mean of the model's last-layer token vectors.

    Args:
        text: A string, or a list of strings, to embed.
        model: Callable accepting the tokenizer's output as keyword arguments
            and returning an object with a ``last_hidden_state`` tensor.
        tokenizer: Callable that turns ``text`` into model inputs. It is
            called with ``return_tensors='pt'``, ``truncation=True`` and
            ``padding=True``.

    Returns:
        A tensor with one row per input text.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        token_embeddings = model(**inputs).last_hidden_state
        attention_mask = inputs.get('attention_mask')
        if attention_mask is None:
            # No mask available: every position counts equally.
            return token_embeddings.mean(dim=1)
        # With padding=True, shorter texts in a batch are padded. Weight each
        # position by its mask so padding does not dilute the average. For a
        # single text the mask is all ones and this equals a plain mean.
        mask = attention_mask.unsqueeze(-1).to(token_embeddings.dtype)
        summed = (token_embeddings * mask).sum(dim=1)
        counts = mask.sum(dim=1).clamp(min=1.0)
        return summed / counts

def search(query, embeddings, documents, model, tokenizer, top_k=1):
    """Return the best-matching page for ``query`` by embedding similarity.

    Args:
        query: Query text; embedded with ``embed_text``.
        embeddings: Sequence of per-page embedding tensors, aligned with
            ``documents``.
        documents: Sequence of page texts.
        model: Model forwarded to ``embed_text``.
        tokenizer: Tokenizer forwarded to ``embed_text``.
        top_k: Length of the slice taken from the ranking. Only the first
            entry of that slice is returned.

    Returns:
        ``(page_number, page_text)`` for the highest-ranked page, where
        ``page_number`` is 1-based.
    """
    query_vec = embed_text(query, model, tokenizer)

    # One cosine score per page, in page order.
    scores = []
    for page_vec in embeddings:
        scores.append(torch.cosine_similarity(query_vec, page_vec, dim=1).item())

    # Descending ranking; the head of the top_k slice is the winner.
    ranking = np.argsort(scores)[::-1]
    best = ranking[:top_k][0]
    return best + 1, documents[best]

# Load the pre-trained model and tokenizer (both come from the same checkpoint name)
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Define the path to your PDF
pdf_path = '/content/DL BOOK .pdf'  # Replace with your actual PDF path

# Extract text from all pages of the PDF (one string per page)
pdf_texts = extract_text_from_pdf(pdf_path)

# Generate one embedding per page. embed_text tokenizes with truncation=True,
# so a page longer than the tokenizer's limit is cut before it is embedded.
embeddings = [embed_text(text, model, tokenizer) for text in pdf_texts]

# Perform a search with a query and show the best-matching page (1-based) and its text
query = "Single Computational Layer: The Perceptron"  # Replace with your desired query
most_relevant_page, answer_text = search(query, embeddings, pdf_texts, model, tokenizer)
print(f"Most Relevant Page: {most_relevant_page}")
print(f"Answer Text: {answer_text}")


Most Relevant Page: 26
Answer Text: 6
CHAPTER 1. AN INTRODUCTION TO NEURAL NETWORKS
The architecture of the perceptron is shown in Figure 1.3(a), in which a single input layer
transmits the features to the output node. The edges from the input to the output contain
the weights w1 . . . wd with which the features are multiplied and added at the output node.
Subsequently, the sign function is applied in order to convert the aggregated value into a
class label. The sign function serves the role of an activation function. Diﬀerent choices
of activation functions can be used to simulate diﬀerent types of models used in machine
learning, like least-squares regression with numeric targets, the support vector machine,
or a logistic regression classiﬁer. Most of the basic machine learning models can be easily
represented as simple neural network architectures. It is a useful exercise to model traditional
machine learning techniques as neural architectures, because it provides a clearer picture 

In [None]:
import fitz  # PyMuPDF for extracting text from PDFs
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def search_in_pdf(pdf_path, query):
    """Find the PDF page whose TF-IDF vector is most similar to ``query``.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.
        query: Query text.

    Returns:
        ``(page_number, page_text)`` for the page with the highest cosine
        similarity to the query, where ``page_number`` is 1-based. When every
        page scores the same (for example, no query term occurs in the PDF),
        ``argmax`` selects the first page.
    """
    # Read one text string per page. The context manager closes the document
    # (and its file handle) even if extraction fails part-way through.
    with fitz.open(pdf_path) as document:
        pdf_texts = [page.get_text() for page in document]

    # Fit the vocabulary on the pages, then project the query into that space.
    vectorizer = TfidfVectorizer()
    tfidf_vectors = vectorizer.fit_transform(pdf_texts)
    query_vector = vectorizer.transform([query])

    # One similarity score per page.
    similarities = cosine_similarity(query_vector, tfidf_vectors).flatten()
    most_similar_index = similarities.argmax()

    most_relevant_page = most_similar_index + 1  # Adjust for 1-based index
    answer_text = pdf_texts[most_similar_index]

    return most_relevant_page, answer_text

# Example usage: keyword (TF-IDF) search for a section title in the PDF.
pdf_path = '/content/DL BOOK .pdf'  # Replace with your actual PDF path
query = "Single Computational Layer: The Perceptron"

# search_in_pdf returns a 1-based page number and that page's full text.
most_relevant_page, answer_text = search_in_pdf(pdf_path, query)
print(f"Most Relevant Page: {most_relevant_page}")
print(f"Answer Text: {answer_text}")


Most Relevant Page: 25
Answer Text: 1.2. THE BASIC ARCHITECTURE OF NEURAL NETWORKS
5
INPUT NODES 
∑
OUTPUT NODE 
y 
w1 
w2 
w3 
  w4 
x4
x3
x2
x1
x5
 w5  
INPUT NODES 
∑
OUTPUT NODE 
w1 
w2 
w3 
  w4 
 w5  
b 
+1 BIAS NEURON 
y 
x4
x3
x2
x1
x5
(a) Perceptron without bias
(b) Perceptron with bias
Figure 1.3: The basic architecture of the perceptron
1.2.1
Single Computational Layer: The Perceptron
The simplest neural network is referred to as the perceptron. This neural network contains
a single input layer and an output node. The basic architecture of the perceptron is shown
in Figure 1.3(a). Consider a situation where each training instance is of the form (X, y),
where each X = [x1, . . . xd] contains d feature variables, and y ∈{−1, +1} contains the
observed value of the binary class variable. By “observed value” we refer to the fact that it
is given to us as a part of the training data, and our goal is to predict the class variable for
cases in which it is not observed. For example, 

In [None]:
import fitz  # PyMuPDF for extracting text from PDFs
from transformers import AutoModel, AutoTokenizer
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def hybrid_search_in_pdf(pdf_path, query, model_name='sentence-transformers/all-MiniLM-L6-v2', top_k=1):
    """Rank PDF pages by the sum of embedding and TF-IDF cosine similarity.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.
        query: Query text.
        model_name: Checkpoint name given to ``AutoTokenizer.from_pretrained``
            and ``AutoModel.from_pretrained``.
        top_k: Maximum number of pages to return.

    Returns:
        A list of ``(page_number, page_text)`` tuples, best match first, with
        at most ``top_k`` entries. Page numbers are 1-based.
    """
    def extract_text_from_pdf(pdf_path):
        """Return one text string per page, closing the document afterwards."""
        # The context manager releases the file handle even if extraction fails.
        with fitz.open(pdf_path) as document:
            return [page.get_text() for page in document]

    def embed_text(text, model, tokenizer):
        """Mean-pool the model's last hidden state over the token axis."""
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
        with torch.no_grad():
            embeddings = model(**inputs).last_hidden_state.mean(dim=1)
        return embeddings

    def create_tfidf_vectors(texts):
        """Fit a TF-IDF vectorizer on ``texts``; return it with the page vectors."""
        vectorizer = TfidfVectorizer()
        tfidf_vectors = vectorizer.fit_transform(texts)
        return vectorizer, tfidf_vectors

    # Load pre-trained model and tokenizer (done on every call)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    # Extract text from all pages of the PDF
    pdf_texts = extract_text_from_pdf(pdf_path)

    # Embeddings for each page's text, one model forward pass per page
    page_embeddings = [embed_text(text, model, tokenizer) for text in pdf_texts]

    # TF-IDF vectors for textual search
    tfidf_vectorizer, tfidf_vectors = create_tfidf_vectors(pdf_texts)

    # Embed the query using the pre-trained model
    query_embedding = embed_text(query, model, tokenizer)

    # Compute cosine similarity between query embedding and page embeddings
    similarities = [torch.cosine_similarity(query_embedding, emb, dim=1).item() for emb in page_embeddings]
    semantic_scores = np.array(similarities)

    # Use TF-IDF to compute textual similarity
    query_vector = tfidf_vectorizer.transform([query])
    tfidf_similarities = cosine_similarity(query_vector, tfidf_vectors).flatten()

    # Combine semantic and textual scores (unweighted sum of the two cosines)
    combined_scores = semantic_scores + tfidf_similarities

    # Rank pages based on combined scores, highest first
    ranked_indices = np.argsort(combined_scores)[::-1][:top_k]
    ranked_pages = [(idx + 1, pdf_texts[idx]) for idx in ranked_indices]  # Adjust for 1-based index

    return ranked_pages

# Example usage: hybrid (embedding + TF-IDF) search for a section title.
# Each call loads the model and embeds every page again, so it can be slow on a large PDF.
pdf_path = '/content/DL BOOK .pdf'  # Replace with your actual PDF path
query = "Single Computational Layer: The Perceptron"
top_k = 1  # Number of top results to retrieve

results = hybrid_search_in_pdf(pdf_path, query, top_k=top_k)

# Print the top result; results is a list of (1-based page number, page text) tuples
if results:
    page_number, page_text = results[0]
    print(f"Most Relevant Page: {page_number}")
    print(f"Answer Text: {page_text}")
else:
    print("No relevant page found.")


KeyboardInterrupt: 

In [None]:
# Mount Google Drive at /content/drive; later cells read and write
# /content/drive/MyDrive/hf_token.txt through this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
from getpass import getpass

# SECURITY: never hard-code an access token in a notebook. A token with write
# permission was previously committed on this line; it must be treated as
# compromised and revoked in the Hugging Face account settings.
# The token is taken from the HF_TOKEN environment variable when set,
# otherwise it is prompted for without echoing it to the output.
HF_TOKEN = os.environ.get('HF_TOKEN') or getpass('Hugging Face token: ')

In [None]:
# Write the token to a file on the mounted Google Drive.
# NOTE(review): the token is stored in plaintext; anyone with access to this
# Drive location can read it. Confirm this is acceptable.
with open('/content/drive/MyDrive/hf_token.txt', 'w') as f:
    f.write(HF_TOKEN)

In [None]:
import os

# Read the token back from the file on Google Drive, dropping surrounding whitespace.
with open('/content/drive/MyDrive/hf_token.txt', 'r') as f:
    HF_TOKEN = f.read().strip()

# Also expose the token through the HF_TOKEN environment variable for this process.
os.environ['HF_TOKEN'] = HF_TOKEN

In [None]:
from huggingface_hub import login

# Authenticate this runtime with the Hugging Face Hub using HF_TOKEN.
login(token=HF_TOKEN)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
!pip install huggingface_hub



In [None]:
import fitz  # PyMuPDF for extracting text from PDFs
from transformers import AutoModel, AutoTokenizer
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def hybrid_search_in_pdf(pdf_path, query, model_name='sentence-transformers/all-MiniLM-L6-v2', top_k=1):
    """Rank PDF pages by the sum of embedding and TF-IDF cosine similarity.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.
        query: Query text.
        model_name: Checkpoint name given to ``AutoTokenizer.from_pretrained``
            and ``AutoModel.from_pretrained``.
        top_k: Maximum number of pages to return.

    Returns:
        A list of ``(page_number, page_text)`` tuples, best match first, with
        at most ``top_k`` entries. Page numbers are 1-based.
    """
    def extract_text_from_pdf(pdf_path):
        """Return one text string per page, closing the document afterwards."""
        # The context manager releases the file handle even if extraction fails.
        with fitz.open(pdf_path) as document:
            return [page.get_text() for page in document]

    def embed_text(text, model, tokenizer):
        """Mean-pool the model's last hidden state over the token axis."""
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
        with torch.no_grad():
            embeddings = model(**inputs).last_hidden_state.mean(dim=1)
        return embeddings

    def create_tfidf_vectors(texts):
        """Fit a TF-IDF vectorizer on ``texts``; return it with the page vectors."""
        vectorizer = TfidfVectorizer()
        tfidf_vectors = vectorizer.fit_transform(texts)
        return vectorizer, tfidf_vectors

    # Load pre-trained model and tokenizer (done on every call)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    # Extract text from all pages of the PDF
    pdf_texts = extract_text_from_pdf(pdf_path)

    # Embeddings for each page's text, one model forward pass per page
    page_embeddings = [embed_text(text, model, tokenizer) for text in pdf_texts]

    # TF-IDF vectors for textual search
    tfidf_vectorizer, tfidf_vectors = create_tfidf_vectors(pdf_texts)

    # Embed the query using the pre-trained model
    query_embedding = embed_text(query, model, tokenizer)

    # Compute cosine similarity between query embedding and page embeddings
    similarities = [torch.cosine_similarity(query_embedding, emb, dim=1).item() for emb in page_embeddings]
    semantic_scores = np.array(similarities)

    # Use TF-IDF to compute textual similarity
    query_vector = tfidf_vectorizer.transform([query])
    tfidf_similarities = cosine_similarity(query_vector, tfidf_vectors).flatten()

    # Combine semantic and textual scores (unweighted sum of the two cosines)
    combined_scores = semantic_scores + tfidf_similarities

    # Rank pages based on combined scores, highest first
    ranked_indices = np.argsort(combined_scores)[::-1][:top_k]
    ranked_pages = [(idx + 1, pdf_texts[idx]) for idx in ranked_indices]  # Adjust for 1-based index

    return ranked_pages

# Example usage: hybrid (embedding + TF-IDF) search for a section title.
# Each call loads the model and embeds every page again, so it can be slow on a large PDF.
pdf_path = '/content/DL BOOK .pdf'  # Replace with your actual PDF path
query = "Single Computational Layer: The Perceptron"
top_k = 1  # Number of top results to retrieve

results = hybrid_search_in_pdf(pdf_path, query, top_k=top_k)

# Print the top result; results is a list of (1-based page number, page text) tuples
if results:
    page_number, page_text = results[0]
    print(f"Most Relevant Page: {page_number}")
    print(f"Answer Text: {page_text}")
else:
    print("No relevant page found.")


Most Relevant Page: 25
Answer Text: 1.2. THE BASIC ARCHITECTURE OF NEURAL NETWORKS
5
INPUT NODES 
∑
OUTPUT NODE 
y 
w1 
w2 
w3 
  w4 
x4
x3
x2
x1
x5
 w5  
INPUT NODES 
∑
OUTPUT NODE 
w1 
w2 
w3 
  w4 
 w5  
b 
+1 BIAS NEURON 
y 
x4
x3
x2
x1
x5
(a) Perceptron without bias
(b) Perceptron with bias
Figure 1.3: The basic architecture of the perceptron
1.2.1
Single Computational Layer: The Perceptron
The simplest neural network is referred to as the perceptron. This neural network contains
a single input layer and an output node. The basic architecture of the perceptron is shown
in Figure 1.3(a). Consider a situation where each training instance is of the form (X, y),
where each X = [x1, . . . xd] contains d feature variables, and y ∈{−1, +1} contains the
observed value of the binary class variable. By “observed value” we refer to the fact that it
is given to us as a part of the training data, and our goal is to predict the class variable for
cases in which it is not observed. For example, 