In [None]:
import os
import fitz  #fitz we'll extract text from PDF

To import fitz, we need to install the PyMuPDF library.

In [None]:
pip install PyMuPDF

In [None]:
# Extract the text of every PDF in a folder
def extract_text_from_pdf(folder_path):
    """Return the extracted text of each PDF file found in ``folder_path``.

    Files are visited in sorted name order so that the position of a
    document in the result is reproducible between runs.

    Args:
        folder_path: Directory to scan (not recursive). Only entries whose
            name ends with ".pdf" are opened.

    Returns:
        A list with one string per PDF, each being the concatenated
        ``get_text("text")`` output of all pages of that file. The list is
        empty when the folder contains no PDFs.
    """
    texts = []
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".pdf"):
            continue
        pdf_path = os.path.join(folder_path, filename)
        # The context manager closes the document even if a page fails.
        with fitz.open(pdf_path) as doc:
            texts.append("".join(page.get_text("text") for page in doc))
    return texts

# Extract the text of every PDF in a folder
def extract_text_from_pdf(folder_path):
    """Collect the text of all PDF files stored directly in ``folder_path``.

    Args:
        folder_path: Path of the directory to scan. Entries whose name does
            not end with ".pdf" are ignored.

    Returns:
        A list of strings, one per PDF in sorted file-name order. Each string
        is the page texts of that PDF joined together. Returns an empty list
        when no PDF is present.
    """
    texts = []
    for filename in sorted(os.listdir(folder_path)):
        if filename.endswith(".pdf"):
            # Use one name for the path, both when building and when opening it.
            pdf_path = os.path.join(folder_path, filename)
            with fitz.open(pdf_path) as doc:  # closed automatically
                # "text" is the output-format name expected by get_text().
                page_texts = [page.get_text("text") for page in doc]
            texts.append("".join(page_texts))
    return texts

# extract_text_from_pdf expects the path of a folder that contains the PDFs
# (a string), not a list of file names.
pdf_folder = "demoo/"
document = extract_text_from_pdf(pdf_folder)

Problems in your code:

Variable name mismatch:
You wrote pdf_pdf = ... but then called fitz.open(pdf_path) → the variable name should be consistent.

Wrong get_text() usage:
page.get_text("text") is correct (not page.get_text(text)).

Not appending text to texts list:
Right now your function never returns any extracted content.

pdf_folder usage:
You set pdf_folder = ["Rohith_CV.pdf","budget_speech"], but the function expects a folder path (string, not list).

Fixed code:

import os
import fitz  # PyMuPDF

# Extract text from all PDFs in a folder
def extract_text_from_pdf(folder_path):
    """Return one text string per PDF file located in ``folder_path``.

    Args:
        folder_path: Directory to scan; only names ending in ".pdf" are read.

    Returns:
        A list of strings in sorted file-name order (``os.listdir`` order is
        arbitrary, so sorting keeps indices stable between runs). Each string
        holds the text of every page of one PDF. Empty list if no PDF exists.
    """
    texts = []
    for filename in sorted(os.listdir(folder_path)):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, filename)
            # Close the document as soon as its pages have been read.
            with fitz.open(pdf_path) as doc:
                text = "".join(page.get_text("text") for page in doc)
            texts.append(text)
    return texts

# Example usage
pdf_folder = "demoo/"   # this should be a folder containing PDFs
documents = extract_text_from_pdf(pdf_folder)

print("Extracted", len(documents), "PDFs")
if documents:  # indexing an empty result would raise IndexError
    print(documents[0][:500])  # print first 500 chars of first PDF

In [None]:
# Folder that holds the PDFs to process.
pdf_folder = "demo/"

In [None]:
# Bare expression: as the last line of a notebook cell it echoes the current value.
pdf_folder

In [None]:
# Run the extraction over pdf_folder; the following cells index the result like a list.
document = extract_text_from_pdf(pdf_folder)

In [None]:
# Show the element at index 1 of the extraction result.
# NOTE(review): assumes at least two documents were extracted - confirm, otherwise this raises IndexError.
print(document[1])

In [None]:
# Report the length of the extraction result.
print("Extracted", len(document), "PDFsTotalLength")

In [None]:
# Preview the first 250 characters of the element at index 1.
print(document[1][:250])

If you're getting a blank string, that usually means the PDF doesn't contain real text but is instead made up of scanned images.

Here's why this happens:
fitz (PyMuPDF), pdfplumber, and PyPDF2 can only extract text that's actually encoded in the PDF.
If your PDF is a scanned image, it has no embedded text → you'll need OCR (Optical Character Recognition).

Quick test:
print(documents[0])
If it's empty → the PDF is image-based.

OCR method for scanned PDFs
You can use pytesseract + pdf2image

Note:
If your PDF is text-based, fitz should work.
If it's image-based, you need OCR.

In [None]:
pip install pytesseract pdf2image pillow

In [None]:
from pdf2image import convert_from_path
import pytesseract

In [None]:
def extract_text_with_ocr(pdf_path):
    """OCR every page of the PDF at ``pdf_path`` and return the combined text.

    The pages are rendered to images with ``convert_from_path`` and each
    image is passed to ``pytesseract.image_to_string``.

    Returns:
        A string that starts with a single space, followed by the recognised
        text of all pages in page order.
    """
    page_images = convert_from_path(pdf_path)
    recognised = [pytesseract.image_to_string(image) for image in page_images]
    return " " + "".join(recognised)

In [None]:
# PDF used to try the OCR route (the notes below describe it as image-based).
pdf_path = "demo/Rohith_CV.pdf"

In [None]:
# OCR the whole file; per the note below, the pdf2image step needs Poppler installed.
ocr_text = extract_text_with_ocr(pdf_path)

That error means pdf2image is working, but Poppler (the backend tool it needs) isn't installed on your system.

If installing Poppler is a hassle, you can skip pdf2image and directly use PyMuPDF's built-in OCR (since version 1.23.0).

In [None]:
# Read the CV page by page, falling back to OCR for pages without a text layer.
doc = fitz.open("demo/Rohith_CV.pdf")
text = ""
for page in doc:
    page_text = page.get_text("text")
    # Test the current page, not the text accumulated so far.
    if not page_text.strip():
        # "ocr" is not a get_text() output format. OCR is done by building a
        # TextPage with get_textpage_ocr() (needs Tesseract's language data)
        # and extracting text from that.
        ocr_textpage = page.get_textpage_ocr()
        page_text = page.get_text("text", textpage=ocr_textpage)
    text += page_text
doc.close()
print(text[:500])

budget_speech.pdf → text-based (works directly with fitz).
Rohith_CV.pdf → image-based (needs OCR).
sk.pdf → unknown (we need to detect whether text exists and otherwise fall back to OCR).

In [None]:
def extract_text_auto(pdf_path):
    """Return the text of a PDF, using OCR only when it has no text at all.

    The embedded text layer is read first. OCR (pdf2image + pytesseract) is
    used only when no page yields any non-whitespace text, so a single blank
    page in an otherwise text-based PDF no longer sends the whole file
    through OCR or appends OCR output after the text already extracted.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated page texts, or the OCR output of every page when the
        file contains no extractable text.
    """
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_path) as doc:
        page_texts = (page.get_text("text") for page in doc)
        text = "".join(chunk for chunk in page_texts if chunk.strip())

    if text:
        return text

    print(f"Using OCR for: {pdf_path}")
    return "".join(
        pytesseract.image_to_string(page_image)
        for page_image in convert_from_path(pdf_path)
    )


In [None]:
# Directory scanned for PDFs by the extraction loop.
folder = "demo"
# Maps each PDF file name to its extracted text.
all_text = {}

In [None]:
# Extract every PDF in `folder` and store the text under its file name.
for file in os.listdir(folder):
    if not file.endswith(".pdf"):
        continue  # ignore anything that is not a PDF
    pdf_path = os.path.join(folder, file)
    extracted_text = extract_text_auto(pdf_path)
    all_text[file] = extracted_text
    print(f"Extracted {len(extracted_text)} characters from {file}")

Here's what's happening in your run:
Rohith_CV.pdf → OCR worked fine (3114 characters).
sk.pdf → text extraction worked fine (4940 characters).
budget_speech.pdf → wrongly went down the OCR path, then failed because Poppler (pdfinfo) is missing.
That means budget_speech.pdf actually does have text, but our function mistakenly decided it needed OCR.

We don't want to call pdf2image at all unless the PDF really has no text anywhere.
Right now, if just one page is empty, we switch to OCR for the whole file → that's why the budget speech triggered OCR unnecessarily.

In [None]:
#Improved Function (OCR only if entire file is empty)
import fitz  # PyMuPDF
import pytesseract
from pdf2image import convert_from_path

def extract_text_auto(pdf_path):
    """Return the text of a PDF, falling back to OCR when it has none.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages that contain non-whitespace text, joined in
        page order. If no page contains text, the OCR output of every page
        (rendered with pdf2image, recognised with pytesseract) is returned.
    """
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text("text") for page in doc]

    # Try the embedded text first, skipping whitespace-only pages.
    text = "".join(chunk for chunk in page_texts if chunk.strip())

    # If no text found at all → fallback to OCR
    if not text.strip():
        print(f"⚡ Using OCR for: {pdf_path}")
        pages = convert_from_path(pdf_path)
        for img_page in pages:
            text += pytesseract.image_to_string(img_page)

    return text


Why this works

budget_speech.pdf → has text → will NOT trigger OCR anymore.
Rohith_CV.pdf → no text at all → will trigger OCR.
sk.pdf → already worked with text → stays the same.

In [None]:
# Directory scanned for PDFs by the extraction loop.
folder = "demo"
# Start again from an empty file-name -> text mapping.
all_text = {}

In [None]:
# Run the extractor over each PDF in `folder`, keyed by file name.
for file in (entry for entry in os.listdir(folder) if entry.endswith(".pdf")):
    pdf_path = os.path.join(folder, file)
    extracted_text = extract_text_auto(pdf_path)
    all_text[file] = extracted_text
    print(f"Extracted {len(extracted_text)} characters from {file}")

In [None]:
#Now save results into .txt
for fname, content in all_text.items():
    # splitext swaps only the final extension; str.replace would also rewrite
    # a ".pdf" that occurs earlier in the file name.
    txt_name = os.path.splitext(fname)[0] + ".txt"
    # The relative name means the file lands in the current working directory.
    with open(txt_name, "w", encoding="utf-8") as f:
        f.write(content)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Texts in the insertion order of all_text (same order as its keys).
documents = list(all_text.values())

In [None]:
# Normalise each text: lower-case it and trim leading/trailing whitespace.
processed_docs = [raw_text.lower().strip() for raw_text in documents]

In [None]:
# TF-IDF vectorizer with scikit-learn's default settings.
vectorizer = TfidfVectorizer()

In [None]:
# Learn the vocabulary from the documents and build the matrix (one row per document).
X = vectorizer.fit_transform(processed_docs)

In [None]:
# Shape is (number of documents, number of vocabulary terms).
print("shape of TF-IDF matrix:",X.shape)

In [None]:
# Peek at the first 50 vocabulary terms.
print(vectorizer.get_feature_names_out()[:50])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all rows of X (document vs document).
similarities = cosine_similarity(X)
print(similarities)

In [None]:
# Rebuild the vectorizer with a cleaner vocabulary.
vectorizer = TfidfVectorizer(
    # Use scikit-learn's built-in English stop-word list.
    stop_words="english",
    # A token is a run of two or more ASCII letters, so digits and punctuation never form tokens.
    token_pattern=r"[a-zA-Z]{2,}",
    # Cap the vocabulary size at 5000 terms.
    max_features=5000
)

# Refit on the same documents; X is replaced by the new matrix.
X = vectorizer.fit_transform(processed_docs)

In [None]:
# Peek at the first 50 terms of the refitted vocabulary.
print(vectorizer.get_feature_names_out()[:50])

In [None]:
# Iterating a dict yields its keys, in the same order as `documents`.
filenames = list(all_text)

In [None]:
import numpy as np

# Vocabulary terms; top_keywords_for_doc indexes this with column positions of X.
feature_names = vectorizer.get_feature_names_out()

def top_keywords_for_doc(doc_index, top_n=10):
    """Return the ``top_n`` highest-weighted terms of one document.

    Reads the module-level ``X`` (one row per document) and
    ``feature_names`` (term for each column of ``X``).

    Args:
        doc_index: Row of ``X`` to inspect.
        top_n: Maximum number of (term, weight) pairs to return.

    Returns:
        A list of ``(term, weight)`` tuples sorted by descending weight.
    """
    # Densify the selected row into a flat array of per-term weights.
    weights = X[doc_index].toarray().flatten()
    # argsort is ascending, so reverse it before keeping the best top_n.
    best_columns = weights.argsort()[::-1][:top_n]
    return [(feature_names[col], weights[col]) for col in best_columns]

# Print the ten strongest terms of every document, under its file name.
for row_index, pdf_name in enumerate(filenames):
    keywords = top_keywords_for_doc(row_index, top_n=10)
    print(f"\n {pdf_name}")
    print(keywords)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

similarities = cosine_similarity(X)

# Compare one reference document with every document (itself included).
# The header now names the document actually used; it previously said
# "first doc" while the code compared the one at index 2.
ref_index = 2  # position of the reference document in `filenames`
print(f"Similarity of {filenames[ref_index]} vs all:")
for fname, score in zip(filenames, similarities[ref_index]):
    print(f"{filenames[ref_index]} vs {fname}: {score:.3f}")


In [None]:
query = "machine learning"
# Map the query into the fitted TF-IDF space.
query_vec = vectorizer.transform([query])

# One score per document: cosine similarity between the query and each row of X.
similarities = cosine_similarity(query_vec, X).flatten()

# Sort by relevance (highest score first)
results = sorted(zip(filenames, similarities), key=lambda x: -x[1])

# The header emoji was mis-encoded in the original string.
print("\n🔎 Search results for:", query)
for fname, score in results:
    print(f"{fname}: {score:.3f}")


In [None]:
# Diagnose the search: are the query words in the vocabulary,
# and does the CV text contain the phrase itself?
for term in ("machine", "learning"):
    print(term in vectorizer.get_feature_names_out())
cv_text = all_text["Rohith_CV.pdf"].lower()
print("machine learning" in cv_text)