In [18]:
pip install opencv-python pytesseract torch torchvision

Note: you may need to restart the kernel to use updated packages.


In [19]:
import cv2
import pytesseract
import numpy as np
import torch
from torchvision import transforms
from PIL import Image

In [20]:
pip install opencv-python pdf2image numpy

Note: you may need to restart the kernel to use updated packages.


In [21]:
import cv2
import glob

folder_path = "Text Search in Scanned Manuscript/sample_data"

# Get all image files in the folder
image_files = glob.glob(folder_path + "/*.jpg") + glob.glob(folder_path + "/*.png") + glob.glob(folder_path + "/*.jpeg")

# Read images
images = [cv2.imread(img) for img in image_files]

# Display the first image (optional)
if images:
    cv2.imshow("First Image", images[0])
    cv2.waitKey(0)
    cv2.destroyAllWindows()


In [22]:
import os

folder_path = r"C:\Users\SONALI\Text Search in Scanned Manuscript\sample_data"
print("Checking folder path:", os.path.abspath(folder_path))

if os.path.exists(folder_path):
    print("Folder exists!")
else:
    print("Folder does not exist! Check the path again.")



Checking folder path: C:\Users\SONALI\Text Search in Scanned Manuscript\sample_data
Folder exists!


In [28]:
#preprocess
import cv2
import os
import numpy as np
from concurrent.futures import ThreadPoolExecutor  # Use threading instead of multiprocessing

# Paths
folder_path = "sample_data"
output_folder = "preprocessed_data"

# Create output directory if not exists
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# List image files
image_files = [f for f in os.listdir(folder_path) if f.endswith(('.png', '.jpg', '.jpeg'))]

def preprocess_image(file):
    """Preprocess a single image."""
    img_path = os.path.join(folder_path, file)
    save_path = os.path.join(output_folder, file)

    # Load and resize image (reduces computation)
    img = cv2.imread(img_path, cv2.IMREAD_COLOR)
    img = cv2.resize(img, (1024, 1024))  # Resize to speed up processing

    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Denoising with a smaller blur kernel
    blurred = cv2.GaussianBlur(gray, (3, 3), 0)

    # Faster thresholding using Otsu's method
    _, binary = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Faster Morphological Operations
    kernel = np.ones((1,1), np.uint8)  # Smaller kernel
    cleaned = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)

    # Save the processed image
    cv2.imwrite(save_path, cleaned)

    return f"Processed: {file}"

# Use threading to speed up processing
with ThreadPoolExecutor() as executor:
    results = list(executor.map(preprocess_image, image_files))

# Print results
for res in results:
    print(res)

print("✅ All images preprocessed and saved in:", output_folder)


Processed: pic1.jpg
Processed: pic10.jpg
Processed: pic11.jpg
Processed: pic12.jpg
Processed: pic13.jpg
Processed: pic14.jpg
Processed: pic15.jpg
Processed: pic16.jpg
Processed: pic17.jpg
Processed: pic18.jpg
Processed: pic19.jpg
Processed: pic2.jpg
Processed: pic20.jpg
Processed: pic21.jpg
Processed: pic22.jpg
Processed: pic23.jpg
Processed: pic24.jpg
Processed: pic25.jpg
Processed: pic26.jpg
Processed: pic27.jpg
Processed: pic28.jpg
Processed: pic29.jpg
Processed: pic3.jpg
Processed: pic30.jpg
Processed: pic31.jpg
Processed: pic32.jpg
Processed: pic33.jpg
Processed: pic34.jpg
Processed: pic35.jpg
Processed: pic36.jpg
Processed: pic37.jpg
Processed: pic38.jpg
Processed: pic39.jpg
Processed: pic4.jpg
Processed: pic40.jpg
Processed: pic41.jpg
Processed: pic42.jpg
Processed: pic43.jpg
Processed: pic44.jpg
Processed: pic45.jpg
Processed: pic46.jpg
Processed: pic47.jpg
Processed: pic48.jpg
Processed: pic49.jpg
Processed: pic5.jpg
Processed: pic50.jpg
Processed: pic6.jpg
Processed: pic7.jpg

In [30]:
pip install pymupdf

Note: you may need to restart the kernel to use updated packages.


In [32]:
import fitz  # PyMuPDF
import os

# PDF file path
pdf_path = "dataset.pdf"  # Change this to your PDF file
output_folder = "pdf_images"

# Create output directory if not exists
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Open the PDF
pdf_document = fitz.open(pdf_path)

# Loop through all the pages and convert them to images
image_paths = []
for page_num in range(pdf_document.page_count):
    page = pdf_document.load_page(page_num)  # Load each page
    pix = page.get_pixmap(dpi=300)  # Render page as image at 300 DPI
    img_path = os.path.join(output_folder, f"page_{page_num + 1}.png")
    pix.save(img_path)  # Save the image as PNG
    image_paths.append(img_path)

print(f"✅ {len(image_paths)} pages extracted and saved as images.")




✅ 13 pages extracted and saved as images.


In [33]:
#preprocess of pdf
import cv2
import os
import numpy as np
from concurrent.futures import ThreadPoolExecutor  # Use threading instead of multiprocessing

# Paths
folder_path = "pdf_images"
output_folder = "processed_images"

# Create output directory if not exists
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# List image files
image_files = [f for f in os.listdir(folder_path) if f.endswith(('.png', '.jpg', '.jpeg'))]

def preprocess_image(file):
    """Preprocess a single image."""
    img_path = os.path.join(folder_path, file)
    save_path = os.path.join(output_folder, file)

    # Load and resize image (reduces computation)
    img = cv2.imread(img_path, cv2.IMREAD_COLOR)
    img = cv2.resize(img, (1024, 1024))  # Resize to speed up processing

    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Denoising with a smaller blur kernel
    blurred = cv2.GaussianBlur(gray, (3, 3), 0)

    # Faster thresholding using Otsu's method
    _, binary = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Faster Morphological Operations
    kernel = np.ones((1,1), np.uint8)  # Smaller kernel
    cleaned = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)

    # Save the processed image
    cv2.imwrite(save_path, cleaned)

    return f"Processed: {file}"

# Use threading to speed up processing
with ThreadPoolExecutor() as executor:
    results = list(executor.map(preprocess_image, image_files))

# Print results
for res in results:
    print(res)

print("✅ All images preprocessed and saved in:", output_folder)



Processed: page_1.png
Processed: page_10.png
Processed: page_11.png
Processed: page_12.png
Processed: page_13.png
Processed: page_2.png
Processed: page_3.png
Processed: page_4.png
Processed: page_5.png
Processed: page_6.png
Processed: page_7.png
Processed: page_8.png
Processed: page_9.png
✅ All images preprocessed and saved in: processed_images


In [34]:
#Text Detection
#useing the tesseract model 
!pip install tesseract



In [35]:
#Text Detection 50 images
import os
import pytesseract
from pytesseract import Output
from PIL import Image

# Define output folder containing preprocessed images
output_folder = "preprocessed_data"  # Update this path

# Path to Tesseract executable (update this according to your system)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def detect_text(file):
    """Detect text in a single preprocessed image."""
    img_path = os.path.join(output_folder, file)

    # Load the preprocessed image
    img = Image.open(img_path)

    # Use Tesseract to detect text
    text_data = pytesseract.image_to_data(img, output_type=Output.DICT)

    # Extract text and bounding box information
    detected_text = []
    for i in range(len(text_data['text'])):
        if text_data['text'][i].strip() and int(text_data['conf'][i]) > 60:  # Confidence threshold
            (x, y, w, h) = (text_data['left'][i], text_data['top'][i], text_data['width'][i], text_data['height'][i])
            text = text_data['text'][i]
            detected_text.append((text, (x, y, w, h)))

    return detected_text if detected_text else [("No text detected", (0, 0, 0, 0))]

# Get list of images in the output folder
image_files = [f for f in os.listdir(output_folder) if f.endswith(('.png', '.jpg', '.jpeg'))]

# Detect text in all preprocessed images
text_results = {file: detect_text(file) for file in image_files}

# Print detected text for each image
for file, text in text_results.items():
    print(f"Detected text in {file}:")
    for t, bbox in text:
        print(f"Text: {t}, Bounding Box: {bbox}")
    print("\n")

print("✅ Text detection completed.")





Detected text in pic1.jpg:
Text: A, Bounding Box: (954, 707, 14, 19)
Text: ., Bounding Box: (265, 785, 6, 5)
Text: as, Bounding Box: (251, 849, 13, 26)
Text: t, Bounding Box: (750, 836, 19, 63)


Detected text in pic10.jpg:
Text: y, Bounding Box: (659, 113, 14, 23)
Text: F, Bounding Box: (943, 112, 21, 22)
Text: (, Bounding Box: (190, 207, 11, 29)
Text: ~, Bounding Box: (289, 211, 24, 4)
Text: fo, Bounding Box: (483, 240, 14, 33)
Text: Dy, Bounding Box: (421, 378, 89, 39)
Text: Hall, Bounding Box: (768, 631, 58, 38)
Text: “, Bounding Box: (483, 885, 8, 7)
Text: -, Bounding Box: (394, 918, 10, 3)


Detected text in pic11.jpg:
Text: C., Bounding Box: (162, 83, 52, 58)
Text: /, Bounding Box: (193, 197, 12, 19)
Text: (, Bounding Box: (187, 219, 6, 16)
Text: Threw, Bounding Box: (86, 232, 100, 49)
Text: bo, Bounding Box: (399, 336, 20, 39)
Text: in, Bounding Box: (134, 367, 48, 18)
Text: ten, Bounding Box: (324, 387, 41, 22)
Text: g, Bounding Box: (754, 426, 19, 25)
Text: ., Bounding Box: (

In [36]:
#Text Detection for pdf
import os
import pytesseract
from pytesseract import Output
from PIL import Image

# Define output folder containing preprocessed images
output_folder = "processed_images"  # Update this path

# Path to Tesseract executable (update this according to your system)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def detect_text(file):
    """Detect text in a single preprocessed image."""
    img_path = os.path.join(output_folder, file)

    # Load the preprocessed image
    img = Image.open(img_path)

    # Use Tesseract to detect text
    text_data = pytesseract.image_to_data(img, output_type=Output.DICT)

    # Extract text and bounding box information
    detected_text = []
    for i in range(len(text_data['text'])):
        if text_data['text'][i].strip() and int(text_data['conf'][i]) > 60:  # Confidence threshold
            (x, y, w, h) = (text_data['left'][i], text_data['top'][i], text_data['width'][i], text_data['height'][i])
            text = text_data['text'][i]
            detected_text.append((text, (x, y, w, h)))

    return detected_text if detected_text else [("No text detected", (0, 0, 0, 0))]

# Get list of images in the output folder
image_files = [f for f in os.listdir(output_folder) if f.endswith(('.png', '.jpg', '.jpeg'))]

# Detect text in all preprocessed images
text_results = {file: detect_text(file) for file in image_files}

# Print detected text for each image
for file, text in text_results.items():
    print(f"Detected text in {file}:")
    for t, bbox in text:
        print(f"Text: {t}, Bounding Box: {bbox}")
    print("\n")

print("✅ Text detection completed.")


Detected text in page_1.png:
Text: THE, Bounding Box: (687, 152, 83, 20)
Text: VIRGINIA, Bounding Box: (619, 196, 218, 29)
Text: ALMANACK, Bounding Box: (564, 252, 325, 51)
Text: FOR, Bounding Box: (680, 315, 42, 16)
Text: THE, Bounding Box: (734, 316, 45, 16)
Text: YearofourLorp, Bounding Box: (563, 342, 253, 30)
Text: Gop, Bounding Box: (767, 338, 44, 48)
Text: 1762., Bounding Box: (824, 346, 65, 36)
Text: BISSEXTILE,, Bounding Box: (565, 433, 168, 27)
Text: or, Bounding Box: (747, 441, 18, 14)
Text: Toe, Bounding Box: (563, 502, 19, 10)
Text: the, Bounding Box: (826, 504, 14, 11)
Text: Sum, Bounding Box: (848, 504, 20, 11)
Text: ang, Bounding Box: (875, 507, 16, 10)
Text: and, Bounding Box: (812, 521, 15, 9)
Text: Soothing,, Bounding Box: (832, 520, 55, 15)
Text: of, Bounding Box: (882, 510, 8, 30)
Text: red, Bounding Box: (746, 554, 3, 3)
Text: te, Bounding Box: (765, 541, 9, 31)
Text: the, Bounding Box: (785, 550, 10, 11)
Text: HORIZON, Bounding Box: (802, 550, 55, 13)
Text: of, B

In [37]:
#Text Recognition for 50 images
import os
import pytesseract
from pytesseract import Output
from PIL import Image

# Define output folder containing preprocessed images
output_folder = "preprocessed_data"  # Update this path

# Path to Tesseract executable (update this according to your system)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def recognize_text(file):
    """Recognize text in a single preprocessed image."""
    img_path = os.path.join(output_folder, file)

    # Load the preprocessed image
    img = Image.open(img_path)

    # Use Tesseract to recognize text
    text = pytesseract.image_to_string(img, config="--psm 6")  # Page segmentation mode 6 for better accuracy

    return text.strip() if text.strip() else "No text recognized"

# Recognize text in all preprocessed images
recognized_text_results = {file: recognize_text(file) for file in os.listdir(output_folder) if file.endswith(('.png', '.jpg', '.jpeg'))}

# Print recognized text for each image
for file, text in recognized_text_results.items():
    print(f"Recognized text in {file}:\n{text}\n{'-'*50}\n")

# Optionally, save results to a file
output_file = "recognized_text_results.txt"
with open(output_file, "w", encoding="utf-8") as f:
    for file, text in recognized_text_results.items():
        f.write(f"Recognized text in {file}:\n{text}\n{'-'*50}\n")

print(f"✅ Text recognition completed. Results saved to {output_file}")


Recognized text in pic1.jpg:
rr ae Fd . 1
I. 27
ly, 7 = i
tt C —_- ep |
= ean - ~
: © ane re <9 nT Sie: awe ey 3 tc es 5 -
Vneghay PLM A BE Pe Bick Gong ee
; aT ae, aes Ell Len
ss DOG of seve D 7 LE 9 home» ang -
| ay ald AO Os yp til of a on we.
Stra epee te
| , - Gpesare- recall Liyin. Lions ;
by Lob in ee Ay Pla. Broth CR gh Ra ae” z
i eee «te te Pee . . on — —*
--------------------------------------------------

Recognized text in pic10.jpg:
or
0 rome sil € pte F
SS DADDY Ay >
z ( a gS relleo nw ote ha
Mercer argh Sina gion have lo makin Cpumeteroal gua renga est
& TB bag voamnSinen Aig fim hn as

Vik OB pened yt ym o Aion .. .

Lh wim Oui L pan weal? 2
BDC LE en a

CS hweng ome ef the Mie em om O g

hive he ew tel hes ose Karnes ete Danas 4
p hives Bly. a? at te ; -

bel Ke y.wen Sipue LO Aa ee

my te give MSRM nh haere Ae, Yo -
abs V2 yim Cake ha deore ober gov iQhanioan F
Fae Oe be my Phe ‘were iL 13 2 nde Soke . =

"Sonn a tbl hoa f bog AO _

“Pipher » Wah LL yk tobe

£9 OS pO 

In [38]:
#Text recognition of pdf
import os
import pytesseract
from pytesseract import Output
from PIL import Image

# Define output folder containing preprocessed images
output_folder = "processed_images"  # Update this path

# Path to Tesseract executable (update this according to your system)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def recognize_text(file):
    """Recognize text in a single preprocessed image."""
    img_path = os.path.join(output_folder, file)

    # Load the preprocessed image
    img = Image.open(img_path)

    # Use Tesseract to recognize text
    text = pytesseract.image_to_string(img, config="--psm 6")  # Page segmentation mode 6 for better accuracy

    return text.strip() if text.strip() else "No text recognized"

# Recognize text in all preprocessed images
recognized_text_resultss = {file: recognize_text(file) for file in os.listdir(output_folder) if file.endswith(('.png', '.jpg', '.jpeg'))}

# Print recognized text for each image
for file, text in recognized_text_resultss.items():
    print(f"Recognized text in {file}:\n{text}\n{'-'*50}\n")

# Optionally, save results to a file
output_file = "recognized_text_resultss.txt"
with open(output_file, "w", encoding="utf-8") as f:
    for file, text in recognized_text_resultss.items():
        f.write(f"Recognized text in {file}:\n{text}\n{'-'*50}\n")

print(f"✅ Text recognition completed. Results saved to {output_file}")


Recognized text in page_1.png:
THE
FOR THE

|YearofourLorp Gop 1762.

jPISSEXTILE, or Leap-Year,

q wrotrin ane convareap

Tor Lewarrons, Congerctions, Beetrsgs; the Sum and
Moce’t Ritng and Serz.ngy the Ring, Set ag and Soothing, of
the Heaveaty Booire; Weasatn; £2 Catcenaven)
AcceaDinG To AeT 5 10d refer t© the HORIZON of 48
Degrees Riv Lave, ind a Mendis of Five Hours Wf from
the Cay of Lenin, feting Singin, Merlard, NevéCoriua,|
ke malle a Take of Coaw-Davi ; Deferptin the Rowe
threegh the Conrixeas with a bib cf the COUNCIL and
Hocse of Beecevens of Virginie:

] To muscu 1s aot,

JA Colle on ef appersee Matis, enterusing Evtce ame, etcu
Asreporns, daertng Srovits, &¢, Ge. Se, Calened
Ieyrece Thom and Amt: eMknT.

——

By THEOPHILUS Wreg, Philm

——

Fer to pleafe Ail, we frive with all eur Might;
But kaze, ‘tis for your Money thar tue verite,

——————q—~_—_oe

i

| WILLIAMSBURG:

Printed and Sold by Joseru Roviz, and Ce

en |
--------------------------------------------------

Recogn

In [39]:
import re

def clean_text(text):
    # Remove multiple spaces and newlines
    text = re.sub(r'\s+', ' ', text).strip()
    # Remove special characters (keep punctuation if required)
    text = re.sub(r'[^\w\s.,!?]', '', text)
    return text

In [1]:
#text recognition to improve accuracy 
import os
import pytesseract
from PIL import Image

# Define output folder containing preprocessed images
output_folder = "preprocessed_data"  # Update this path

# Path to Tesseract executable (update this according to your system)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def recognize_text(file):
    """Recognize text in a single preprocessed image."""
    img_path = os.path.join(output_folder, file)
    
    try:
        # Load the preprocessed image
        img = Image.open(img_path)
        
        # Use Tesseract to recognize text
        text = pytesseract.image_to_string(img, config="--psm 6")  # Page segmentation mode 6 for better accuracy
        
        return text.strip() if text.strip() else "No text recognized"
    except Exception as e:
        return f"Error processing {file}: {str(e)}"

if not os.path.exists(output_folder):
    print(f"Error: Output folder '{output_folder}' does not exist.")
else:
    # Recognize text in all preprocessed images
    recognized_text_results = {
        file: recognize_text(file) for file in os.listdir(output_folder) if file.endswith(('.png', '.jpg', '.jpeg'))
    }
    
    # Print recognized text for each image
    for file, text in recognized_text_results.items():
        print(f"Recognized text in {file}:\n{text}\n{'-'*50}\n")
    
    # Optionally, save results to a file
    output_file = "recognized_text_results.txt"
    with open(output_file, "w", encoding="utf-8") as f:
        for file, text in recognized_text_results.items():
            f.write(f"Recognized text in {file}:\n{text}\n{'-'*50}\n")
    
    print(f"✅ Text recognition completed. Results saved to {output_file}")

Recognized text in pic1.jpg:
rr ae Fd . 1
I. 27
ly, 7 = i
tt C —_- ep |
= ean - ~
: © ane re <9 nT Sie: awe ey 3 tc es 5 -
Vneghay PLM A BE Pe Bick Gong ee
; aT ae, aes Ell Len
ss DOG of seve D 7 LE 9 home» ang -
| ay ald AO Os yp til of a on we.
Stra epee te
| , - Gpesare- recall Liyin. Lions ;
by Lob in ee Ay Pla. Broth CR gh Ra ae” z
i eee «te te Pee . . on — —*
--------------------------------------------------

Recognized text in pic10.jpg:
or
0 rome sil € pte F
SS DADDY Ay >
z ( a gS relleo nw ote ha
Mercer argh Sina gion have lo makin Cpumeteroal gua renga est
& TB bag voamnSinen Aig fim hn as

Vik OB pened yt ym o Aion .. .

Lh wim Oui L pan weal? 2
BDC LE en a

CS hweng ome ef the Mie em om O g

hive he ew tel hes ose Karnes ete Danas 4
p hives Bly. a? at te ; -

bel Ke y.wen Sipue LO Aa ee

my te give MSRM nh haere Ae, Yo -
abs V2 yim Cake ha deore ober gov iQhanioan F
Fae Oe be my Phe ‘were iL 13 2 nde Soke . =

"Sonn a tbl hoa f bog AO _

“Pipher » Wah LL yk tobe

£9 OS pO 

In [3]:
!pip install pyspellchecker

Collecting pyspellchecker
  Downloading pyspellchecker-0.8.2-py3-none-any.whl.metadata (9.4 kB)
Downloading pyspellchecker-0.8.2-py3-none-any.whl (7.1 MB)
   ---------------------------------------- 0.0/7.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/7.1 MB 660.6 kB/s eta 0:00:11
   - -------------------------------------- 0.2/7.1 MB 3.0 MB/s eta 0:00:03
   --- ------------------------------------ 0.7/7.1 MB 5.2 MB/s eta 0:00:02
   ------ --------------------------------- 1.1/7.1 MB 6.3 MB/s eta 0:00:01
   ------- -------------------------------- 1.4/7.1 MB 6.5 MB/s eta 0:00:01
   --------- ------------------------------ 1.7/7.1 MB 6.2 MB/s eta 0:00:01
   ---------- ----------------------------- 1.9/7.1 MB 6.5 MB/s eta 0:00:01
   ------------- -------------------------- 2.3/7.1 MB 6.8 MB/s eta 0:00:01
   --------------- ------------------------ 2.8/7.1 MB 7.1 MB/s eta 0:00:01
   ----------------- ---------------------- 3.1/7.1 MB 7.1 MB/s eta 0:00:01
   -----------

In [5]:
pip install textblob

Collecting textblob
  Downloading textblob-0.19.0-py3-none-any.whl.metadata (4.4 kB)
Collecting nltk>=3.9 (from textblob)
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Downloading textblob-0.19.0-py3-none-any.whl (624 kB)
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   -- ------------------------------------- 41.0/624.3 kB 1.9 MB/s eta 0:00:01
   -------------------- ------------------- 327.7/624.3 kB 5.0 MB/s eta 0:00:01
   ---------------------------------------- 624.3/624.3 kB 5.6 MB/s eta 0:00:00
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------- ----------------------- 0.6/1.5 MB 19.5 MB/s eta 0:00:01
   ---------------------------- ----------- 1.1/1.5 MB 11.2 MB/s eta 0:00:01
   ---------------------------------------  1.5/1.5 MB 10.6 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 9.5 MB/s eta 0:00:00
Installing collected packages

In [13]:
#text correction and validation
import os
from textblob import TextBlob

# Load recognized text results from the file
recognized_text_file = "recognized_text_results.txt"
corrected_text_file = "corrected_text_results.txt"

if not os.path.exists(recognized_text_file):
    print(f"Error: '{recognized_text_file}' not found.")
else:
    with open(recognized_text_file, "r", encoding="utf-8") as file:
        recognized_text = file.read()

    # Split the text into sections per file
    sections = recognized_text.split('-' * 50)

    corrected_sections = []

    for section in sections:
        if section.strip():
            lines = section.strip().split('\n', 1)
            file_info = lines[0] if lines else "Unknown file"
            content = lines[1] if len(lines) > 1 else ""

            # Correct the content using TextBlob
            corrected_content = str(TextBlob(content).correct())

            # Store the corrected section
            corrected_sections.append(f"{file_info}\n{corrected_content}")

    # Save the corrected text to a file
    with open(corrected_text_file, "w", encoding="utf-8") as file:
        file.write("\n" + '-' * 50 + "\n".join(corrected_sections))

    print(f"✅ Corrected text saved to '{corrected_text_file}'")


✅ Corrected text saved to 'corrected_text_results.txt'


In [14]:
#standardized by removing puntuations and convereted to lowercase
import re
import os

corrected_text_file = "corrected_text_results.txt"
standardized_text_file = "standardized_text_results.txt"

def standardize_text(text):
    # Remove special characters and convert to lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Keep only alphanumeric characters and spaces
    return text.lower()

if not os.path.exists(corrected_text_file):
    print(f"Error: '{corrected_text_file}' not found.")
else:
    # Read and standardize the corrected text
    with open(corrected_text_file, "r", encoding="utf-8") as file:
        corrected_text = file.read()

    # Standardize text
    standardized_text = standardize_text(corrected_text)

    # Save the standardized text to a new file
    with open(standardized_text_file, "w", encoding="utf-8") as file:
        file.write(standardized_text)

    print(f"✅ Text has been standardized and saved to '{standardized_text_file}'")

✅ Text has been standardized and saved to 'standardized_text_results.txt'


In [17]:
import os

# Path to the cleaned dataset file
cleaned_dataset_file = "standardized_text_results.txt"

def display_cleaned_text(file_path):
    """Read and display the standardized dataset content."""
    if os.path.exists(file_path):
        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()
            print("📝 Standardized Dataset Content:")
            print(content)  # Display entire content
    else:
        print(f"Error: '{file_path}' not found.")

# Display standardized dataset
display_cleaned_text(cleaned_dataset_file)

📝 Standardized Dataset Content:

recognized text in pic1jpg
or a d  1
i 27
ly 7  i
tt c  up 
 an  
  and re 9 no lie awe by 3 to es 5 
vneghay plm a of he sick long he
 a a as all men
is dog of see d 7 of 9 home and 
 ay and of is up til of a on we
tra see te
   gpesare recall ilyin lions 
by mob in he by la roth of go a a z
i see te te see   on  
recognized text in pic10jpg
or
0 rome sir  ate f
of daddy by 
z  a go relief no one ha
verser arch nina gin have lo making cpumeteroal gun ring est
 of bag voamnsinen fig him in as

in of opened it my o lion  

oh him qui l pan weal 2
bdc of en a

of when one of the lie em om o g

hive he we tell he one varies eye hands 4
p hives fly a at te  

be he ywen pipe of a he

my te give msrm no here he to 
as of him take ha dere ober go iqhanioan f
the he be my the were in 13 2 de joke  

on a til how f bog of 

either  ah of y tone

9 of pp was of

penn le cko add of g ha the the

prgrerhenn is apes 7  74 
recognized text in pic11jpg
by he one 7 an

In [19]:
#stop words removed and standardized
import re
import os
import json
import nltk
from nltk.corpus import stopwords

# Ensure stopwords are downloaded
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Path to the standardized dataset file
cleaned_dataset_file = "standardized_text_results.txt"

def extract_data(file_path):
    """Extracts specific data patterns from the standardized text."""
    if not os.path.exists(file_path):
        print(f"Error: '{file_path}' not found.")
        return

    with open(file_path, "r", encoding="utf-8") as file:
        content = file.read()

    # Extract different types of data
    dates = re.findall(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', content)  # Dates
    emails = re.findall(r'\b[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}\b', content)  # Email addresses
    phone_numbers = re.findall(r'\b\d{10}\b', content)  # 10-digit phone numbers
    numbers = re.findall(r'\b\d+\b', content)  # Any standalone number

    # Extract lowercase names and remove stop words
    names = re.findall(r'\b(?:[a-z]+\s?)+\b', content)
    filtered_names = [name for name in names if name.lower() not in stop_words]

    # Print extracted data
    print("📅 Extracted Dates:", dates)
    print("📧 Extracted Email Addresses:", emails)
    print("📱 Extracted Phone Numbers:", phone_numbers)
    print("🔢 Extracted Numbers:", numbers)
    print("👤 Extracted Names (without stop words):", filtered_names)

    # Save the extracted data
    extracted_data_file = "extracted_data.json"
    with open(extracted_data_file, "w", encoding="utf-8") as f:
        json.dump({
            "dates": dates,
            "emails": emails,
            "phone_numbers": phone_numbers,
            "numbers": numbers,
            "names": filtered_names
        }, f, ensure_ascii=False, indent=4)
    
    print(f"✅ Extracted data saved to {extracted_data_file}")

# Perform data extraction
extract_data(cleaned_dataset_file)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SONALI\AppData\Roaming\nltk_data...


📅 Extracted Dates: []
📧 Extracted Email Addresses: []
📱 Extracted Phone Numbers: []
🔢 Extracted Numbers: ['1', '27', '7', '9', '3', '5', '7', '9', '0', '2', '4', '13', '2', '9', '7', '74', '7', '13', '4', '6', '5', '7', '2', '3', '1', '2', '4', '55', '2', '4', '4', '7', '8', '7', '68', '16', '7', '2', '6', '2', '9', '4', '4', '4', '7', '30', '2', '0', '5', '7', '4', '4', '3', '7', '1', '5', '20', '7', '0', '0', '1', '4', '4', '2', '1', '2', '4', '3', '1', '4', '1', '1', '2', '2', '2', '26', '2', '4', '7', '2', '2', '2', '5', '2', '8', '5', '2', '2', '645', '6', '7', '9', '4', '7', '8', '0', '666', '1', '34', '8', '0', '24', '2', '4', '0', '94', '4', '4', '3', '7', '2', '5', '7', '2', '7', '7', '7', '7', '5', '37', '4', '2', '4', '7', '7', '2', '2', '2', '2', '3', '1', '252', '2', '7', '7', '1', '7', '2', '8', '9', '1', '2', '3', '7', '3', '0', '9', '55', '2', '2', '2', '2', '7', '1', '5', '3', '07', '9', '324', '200', '9', '2', '7', '5', '5', '3', '7', '2', '4', '5', '2', '1', '0', '4'

[nltk_data]   Unzipping corpora\stopwords.zip.


In [27]:
#cleaned data set
import json

# Path to the cleaned JSON file
cleaned_data_file = "cleaned_extracted_data.json"

# Load and display the cleaned data
if not os.path.exists(cleaned_data_file):
    print(f"Error: '{cleaned_data_file}' not found.")
else:
    with open(cleaned_data_file, "r", encoding="utf-8") as file:
        data = json.load(file)

    # Display the cleaned data
    print("✅ Cleaned Data Output:")
    for item in data:
        print(json.dumps(item, ensure_ascii=False, indent=4))


✅ Cleaned Data Output:


In [29]:
pip install nltk textblob

Note: you may need to restart the kernel to use updated packages.


In [31]:
#Data storage
import os
import json
import pytesseract
from PIL import Image

# Path to the folder containing images
output_folder = "preprocessed_data"  # Update this if needed

# Tesseract executable path
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def recognize_text(file):
    img_path = os.path.join(output_folder, file)
    try:
        img = Image.open(img_path)
        text = pytesseract.image_to_string(img, config="--psm 6")
        return text.strip() if text.strip() else "No text recognized"
    except Exception as e:
        return f"Error processing {file}: {str(e)}"

# Recognize text from all images
recognized_text_results = [
    {"filename": file, "text": recognize_text(file)} 
    for file in os.listdir(output_folder) if file.endswith(('.png', '.jpg', '.jpeg'))
]

# Save the results to a JSON file
json_file = "recognized_text_results.json"
with open(json_file, "w", encoding="utf-8") as file:
    json.dump(recognized_text_results, file, ensure_ascii=False, indent=4)

print(f"✅ Recognized text saved to '{json_file}'")

✅ Recognized text saved to 'recognized_text_results.json'


In [33]:
import json
import re
from textblob import TextBlob
from nltk.corpus import stopwords
from collections import Counter
import nltk

# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')

# Load the recognized text from JSON file
input_file = "recognized_text_results.json"

if not os.path.exists(input_file):
    print(f"Error: '{input_file}' not found.")
else:
    with open(input_file, "r", encoding="utf-8") as file:
        recognized_data = json.load(file)

    # Preprocess and analyze text data
    stop_words = set(stopwords.words("english"))
    all_texts = []

    print("Text Analysis Results:\n")

    for entry in recognized_data:
        filename = entry['filename']
        text = entry['text']

        # Remove special characters and convert text to lowercase
        clean_text = re.sub(r'[^\w\s]', '', text.lower())
        all_texts.append(clean_text)

        # Keyword extraction
        words = [word for word in clean_text.split() if word not in stop_words]
        top_keywords = Counter(words).most_common(5)

        # Sentiment analysis
        sentiment = TextBlob(clean_text).sentiment

        # Print results for each file
        print(f"📄 File: {filename}")
        print(f"🧠 Top Keywords: {[keyword for keyword, count in top_keywords]}")
        print(f"😊 Sentiment (Polarity): {sentiment.polarity}, (Subjectivity): {sentiment.subjectivity}")
        print("-" * 50)

    # Overall frequency analysis
    all_words = [word for text in all_texts for word in text.split() if word not in stop_words]
    overall_frequency = Counter(all_words).most_common(10)

    print("\n🔍 Overall Keyword Frequency Analysis:")
    for word, count in overall_frequency:
        print(f"{word}: {count}")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SONALI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SONALI\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


Text Analysis Results:

📄 File: pic1.jpg
🧠 Top Keywords: ['ae', 'te', '7', '9', 'ee']
😊 Sentiment (Polarity): 0.0, (Subjectivity): 0.0
--------------------------------------------------
📄 File: pic10.jpg
🧠 Top Keywords: ['f', 'ha', 'lo', '2', 'le']
😊 Sentiment (Polarity): 0.0, (Subjectivity): 0.0
--------------------------------------------------
📄 File: pic11.jpg
🧠 Top Keywords: ['ee', '4', '2', '7', 'ha']
😊 Sentiment (Polarity): 0.39285714285714285, (Subjectivity): 0.5178571428571428
--------------------------------------------------
📄 File: pic12.jpg
🧠 Top Keywords: ['ae', 'ee', '4', '7', 'oo']
😊 Sentiment (Polarity): -0.28181818181818175, (Subjectivity): 0.5833333333333333
--------------------------------------------------
📄 File: pic13.jpg
🧠 Top Keywords: ['4', '7', 'ae', 'ls', 'oe']
😊 Sentiment (Polarity): 0.3, (Subjectivity): 0.2
--------------------------------------------------
📄 File: pic14.jpg
🧠 Top Keywords: ['ee', 's90bl', 'om', '1', 'kc']
😊 Sentiment (Polarity): 0.0, (Sub

In [37]:
#text index and search
from whoosh.index import create_in, open_dir
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser
import json
import os
import shutil

# Paths
index_dir = "text_index"
input_file = "recognized_text_results.json"

# Define schema for indexing
schema = Schema(filename=ID(stored=True), content=TEXT)

# Remove existing index directory if present
if os.path.exists(index_dir):
    shutil.rmtree(index_dir)
    print("🗑️ Removed existing index directory.")
os.mkdir(index_dir)

# Create index
ix = create_in(index_dir, schema)

# Load recognized text data
if not os.path.exists(input_file):
    print(f"Error: '{input_file}' not found.")
else:
    with open(input_file, "r", encoding="utf-8") as file:
        recognized_data = json.load(file)

    # Index the recognized text
    writer = ix.writer()
    for entry in recognized_data:
        filename = entry.get('filename')
        text = entry.get('text', '')
        if text:
            writer.add_document(filename=filename, content=text)
    writer.commit()
    print("✅ Text indexed successfully!")

# Search Functionality
def search_term(term):
    """Search the indexed text for a given term."""
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse(term)
        results = searcher.search(query)
        print(f"\n🔍 Search Results for '{term}':")
        if results:
            for result in results:
                print(f"- Found in file: {result['filename']}")
        else:
            print("⚠️ No results found.")

# Perform a search query
search_query = input("Enter a search term: ")
search_term(search_query)

🗑️ Removed existing index directory.
✅ Text indexed successfully!


Enter a search term:  ee



🔍 Search Results for 'ee':
- Found in file: pic2.jpg
- Found in file: pic21.jpg
- Found in file: pic14.jpg
- Found in file: pic23.jpg
- Found in file: pic7.jpg
- Found in file: pic35.jpg
- Found in file: pic30.jpg
- Found in file: pic4.jpg
- Found in file: pic26.jpg
- Found in file: pic33.jpg


In [42]:
#text recognition to improve accuracy for pdf
import os
import pytesseract
from PIL import Image

# Define output folder containing preprocessed images
output_folder = "processed_images"  # Update this path

# Path to Tesseract executable (update this according to your system)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def recognize_text(file):
    """Recognize text in a single preprocessed image."""
    img_path = os.path.join(output_folder, file)
    
    try:
        # Load the preprocessed image
        img = Image.open(img_path)
        
        # Use Tesseract to recognize text
        text = pytesseract.image_to_string(img, config="--psm 6")  # Page segmentation mode 6 for better accuracy
        
        return text.strip() if text.strip() else "No text recognized"
    except Exception as e:
        return f"Error processing {file}: {str(e)}"

if not os.path.exists(output_folder):
    print(f"Error: Output folder '{output_folder}' does not exist.")
else:
    # Recognize text in all preprocessed images
    recognized_text_results = {
        file: recognize_text(file) for file in os.listdir(output_folder) if file.endswith(('.png', '.jpg', '.jpeg'))
    }
    
    # Print recognized text for each image
    for file, text in recognized_text_results.items():
        print(f"Recognized text in {file}:\n{text}\n{'-'*50}\n")
    
    # Optionally, save results to a file
    output_file = "recognized_text_resultss.txt"
    with open(output_file, "w", encoding="utf-8") as f:
        for file, text in recognized_text_results.items():
            f.write(f"Recognized text in {file}:\n{text}\n{'-'*50}\n")
    
    print(f"✅ Text recognition completed. Results saved to {output_file}")

Recognized text in page_1.png:
THE
FOR THE

|YearofourLorp Gop 1762.

jPISSEXTILE, or Leap-Year,

q wrotrin ane convareap

Tor Lewarrons, Congerctions, Beetrsgs; the Sum and
Moce’t Ritng and Serz.ngy the Ring, Set ag and Soothing, of
the Heaveaty Booire; Weasatn; £2 Catcenaven)
AcceaDinG To AeT 5 10d refer t© the HORIZON of 48
Degrees Riv Lave, ind a Mendis of Five Hours Wf from
the Cay of Lenin, feting Singin, Merlard, NevéCoriua,|
ke malle a Take of Coaw-Davi ; Deferptin the Rowe
threegh the Conrixeas with a bib cf the COUNCIL and
Hocse of Beecevens of Virginie:

] To muscu 1s aot,

JA Colle on ef appersee Matis, enterusing Evtce ame, etcu
Asreporns, daertng Srovits, &¢, Ge. Se, Calened
Ieyrece Thom and Amt: eMknT.

——

By THEOPHILUS Wreg, Philm

——

Fer to pleafe Ail, we frive with all eur Might;
But kaze, ‘tis for your Money thar tue verite,

——————q—~_—_oe

i

| WILLIAMSBURG:

Printed and Sold by Joseru Roviz, and Ce

en |
--------------------------------------------------

Recogn

In [44]:
#text correction and validation for pdf
import os
from textblob import TextBlob

# Load recognized text results from the file
recognized_text_filee = "recognized_text_resultss.txt"
corrected_text_filee = "corrected_text_resultss.txt"

if not os.path.exists(recognized_text_filee):
    print(f"Error: '{recognized_text_filee}' not found.")
else:
    with open(recognized_text_filee, "r", encoding="utf-8") as file:
        recognized_textt = file.read()

    # Split the text into sections per file
    sections = recognized_textt.split('-' * 50)

    corrected_sections = []

    for section in sections:
        if section.strip():
            lines = section.strip().split('\n', 1)
            file_info = lines[0] if lines else "Unknown file"
            content = lines[1] if len(lines) > 1 else ""

            # Correct the content using TextBlob
            corrected_contentt = str(TextBlob(content).correct())

            # Store the corrected section
            corrected_sections.append(f"{file_info}\n{corrected_contentt}")

    # Save the corrected text to a file
    with open(corrected_text_filee, "w", encoding="utf-8") as file:
        file.write("\n" + '-' * 50 + "\n".join(corrected_sections))

    print(f"✅ Corrected text saved to '{corrected_text_filee}'")

✅ Corrected text saved to 'corrected_text_resultss.txt'


In [46]:
corrected_text_file = "corrected_text_resultss.txt"

if not os.path.exists(corrected_text_filee):
    print(f"Error: '{corrected_text_filee}' not found.")
else:
    # Read and display the corrected text file content
    with open(corrected_text_filee, "r", encoding="utf-8") as file:
        corrected_textt = file.read()
        
    print("📄 Corrected Text Output:\n")
    print(corrected_textt)

📄 Corrected Text Output:


--------------------------------------------------Recognized text in page_1.png:
THE
FOR THE

|YearofourLorp Top 1762.

jPISSEXTILE, or Heap-Dear,

q protein and convareap

For Lewarrons, Congerctions, Beetrsgs; the Sum and
One’t King and Her.ney the King, Met a and Soothing, of
the Heaveaty Moore; Peasant; £2 Catcenaven)
AcceaDinG To he 5 and refer t© the HORIZON of 48
Degrees Iv Have, ind a Ends of Give Hours Of from
the May of Vein, being Singing, Gerard, NevéCoriua,|
ke male a Take of Now-Have ; Deferptin the Owe
through the Conrixeas with a big cf the COUNCIL and
House of Beecevens of Virginia:

] To must is at,

of Molle on of appeased Paris, entering Vice ame, etc
Asreporns, daring Profits, &¢, He. He, Wakened
Ieyrece Whom and Mt: eMknT.

——

By THEOPHILUS Are, Him

——

Her to please Oil, we five with all our Right;
But gaze, ‘tis for your Money that the merite,

——————q—~_—toe

i

| WILLIAMSBURG:

Printed and Old by Joseru Viz, and He

en |
Recognized

In [50]:
#standardized by removing puntuations and convereted to lowercase for pdf
import re
import os

corrected_text_filee = "corrected_text_resultss.txt"
standardized_text_filee = "standardized_text_resultss.txt"

def standardize_textt(text):
    # Remove special characters and convert to lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Keep only alphanumeric characters and spaces
    return text.lower()

if not os.path.exists(corrected_text_filee):
    print(f"Error: '{corrected_text_filee}' not found.")
else:
    # Read and standardize the corrected text
    with open(corrected_text_filee, "r", encoding="utf-8") as file:
        corrected_textt = file.read()

    # Standardize text
    standardized_textt = standardize_textt(corrected_textt)

    # Save the standardized text to a new file
    with open(standardized_text_filee, "w", encoding="utf-8") as file:
        file.write(standardized_textt)

    print(f"✅ Text has been standardized and saved to '{standardized_text_filee}'")

✅ Text has been standardized and saved to 'standardized_text_resultss.txt'


In [52]:
import os

# Path to the cleaned dataset file
cleaned_dataset_filee = "standardized_text_resultss.txt"

def display_cleaned_textt(file_path):
    """Read and display the standardized dataset content."""
    if os.path.exists(file_path):
        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()
            print("📝 Standardized Dataset Content:")
            print(content)  # Display entire content
    else:
        print(f"Error: '{file_path}' not found.")

# Display standardized dataset
display_cleaned_textt(cleaned_dataset_filee)

📝 Standardized Dataset Content:

recognized text in page1png
the
for the

yearofourlorp top 1762

jpissextile or heapdear

q protein and convareap

for lewarrons congerctions beetrsgs the sum and
onet king and herney the king met a and soothing of
the heaveaty moore peasant 2 catcenaven
acceading to he 5 and refer t the horizon of 48
degrees iv have ind a ends of give hours of from
the may of vein being singing gerard nevcoriua
ke male a take of nowhave  deferptin the owe
through the conrixeas with a big cf the council and
house of beecevens of virginia

 to must is at

of molle on of appeased paris entering vice ame etc
asreporns daring profits  he he wakened
ieyrece whom and mt emknt



by theophilus are him



her to please oil we five with all our right
but gaze tis for your money that the merite

qtoe

i

 williamsburg

printed and old by joseru viz and he

en 
recognized text in page10png
of 7
wet ve le
4 las a area a he
fvyreed each creamy i do of he of september yodaws
te knash

In [54]:
#stop words removed and standardized for pdf
import re
import os
import json
import nltk
from nltk.corpus import stopwords

# Ensure stopwords are downloaded
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Path to the standardized dataset file
cleaned_dataset_filee = "standardized_text_resultss.txt"

def extract_data(file_path):
    """Extracts specific data patterns from the standardized text."""
    if not os.path.exists(file_path):
        print(f"Error: '{file_path}' not found.")
        return

    with open(file_path, "r", encoding="utf-8") as file:
        content = file.read()

    # Extract different types of data
    dates = re.findall(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', content)  # Dates
    emails = re.findall(r'\b[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}\b', content)  # Email addresses
    phone_numbers = re.findall(r'\b\d{10}\b', content)  # 10-digit phone numbers
    numbers = re.findall(r'\b\d+\b', content)  # Any standalone number

    # Extract lowercase names and remove stop words
    names = re.findall(r'\b(?:[a-z]+\s?)+\b', content)
    filtered_names = [name for name in names if name.lower() not in stop_words]

    # Print extracted data
    print("📅 Extracted Dates:", dates)
    print("📧 Extracted Email Addresses:", emails)
    print("📱 Extracted Phone Numbers:", phone_numbers)
    print("🔢 Extracted Numbers:", numbers)
    print("👤 Extracted Names (without stop words):", filtered_names)

    # Save the extracted data
    extracted_data_filee = "extracted_dataa.json"
    with open(extracted_data_filee, "w", encoding="utf-8") as f:
        json.dump({
            "dates": dates,
            "emails": emails,
            "phone_numbers": phone_numbers,
            "numbers": numbers,
            "names": filtered_names
        }, f, ensure_ascii=False, indent=4)
    
    print(f"✅ Extracted data saved to {extracted_data_filee}")

# Perform data extraction
extract_data(cleaned_dataset_filee)

📅 Extracted Dates: []
📧 Extracted Email Addresses: []
📱 Extracted Phone Numbers: []
🔢 Extracted Numbers: ['1762', '2', '5', '48', '7', '4', '18', '19', '1720', '98', '3', '525334027', '0', '2535', '135', '753062', '736', '9', '19540', '629', '7524', '25416', '2', '22', '19', '853', '9', '16', '9', '26', '6', '9', '21', '65475', '7', '50', '6', '846', '1', '7', '9', '218', '8', '713', '7', '5', '7', '8', '7', '8', '57', '296215', '35', '9', '8704', '2539', '5', '58', '96', '6', '5542', '233', '7', '49', '2553', '22', '2816', '815', '52', '2', '4', '31', '5', '1159', '46', '7', '8', '8', '8', '2', '9', '2', '8', '8', '55', '12', '5', '1', '241302', '7', '130', '7', '135', '2', '9', '5', '24', '01', '2', '3', '2', '4', '3', '4', '30', '6', '3', '9', '7', '5', '319', '4', '1', '33', '9', '287', '1', '4', '570', '75', '29', '7', '855', '81', '1', '7', '8', '26', '4', '0', '8', '532', '58', '369', '2', '18', '48', '1915', '8', '45', '949', '3', '5', '3', '270', '2', '31', '24', '6', '168', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SONALI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [64]:
#cleaned data set for pdf
import json

# Path to the cleaned JSON file
cleaned_data_filee = "cleaned_extracted_dataa.json"

# Load and display the cleaned data
if not os.path.exists(cleaned_data_filee):
    print(f"Error: '{cleaned_data_filee}' not found.")
else:
    with open(cleaned_data_filee, "r", encoding="utf-8") as file:
        data = json.load(file)

    # Display the cleaned data
    print("✅ Cleaned Data Output:")
    for item in data:
        print(json.dumps(item, ensure_ascii=False, indent=4))


✅ Cleaned Data Output:


In [66]:
#Data storage for pdf
import os
import json
import pytesseract
from PIL import Image

# Path to the folder containing images
output_folder = "processed_images"  # Update this if needed

# Tesseract executable path
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def recognize_textt(file):
    img_path = os.path.join(output_folder, file)
    try:
        img = Image.open(img_path)
        text = pytesseract.image_to_string(img, config="--psm 6")
        return text.strip() if text.strip() else "No text recognized"
    except Exception as e:
        return f"Error processing {file}: {str(e)}"

# Recognize text from all images
recognized_text_resultss = [
    {"filename": file, "text": recognize_textt(file)} 
    for file in os.listdir(output_folder) if file.endswith(('.png', '.jpg', '.jpeg'))
]

# Save the results to a JSON file
json_file = "recognized_text_resultss.json"
with open(json_file, "w", encoding="utf-8") as file:
    json.dump(recognized_text_resultss, file, ensure_ascii=False, indent=4)

print(f"✅ Recognized text saved to '{json_file}'")

✅ Recognized text saved to 'recognized_text_resultss.json'


In [68]:
import json
import re
from textblob import TextBlob
from nltk.corpus import stopwords
from collections import Counter
import nltk

# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')

# Load the recognized text from JSON file
input_file = "recognized_text_resultss.json"

if not os.path.exists(input_file):
    print(f"Error: '{input_file}' not found.")
else:
    with open(input_file, "r", encoding="utf-8") as file:
        recognized_dataa = json.load(file)

    # Preprocess and analyze text data
    stop_words = set(stopwords.words("english"))
    all_texts = []

    print("Text Analysis Results:\n")

    for entry in recognized_dataa:
        filename = entry['filename']
        text = entry['text']

        # Remove special characters and convert text to lowercase
        clean_text = re.sub(r'[^\w\s]', '', text.lower())
        all_texts.append(clean_text)

        # Keyword extraction
        words = [word for word in clean_text.split() if word not in stop_words]
        top_keywords = Counter(words).most_common(5)

        # Sentiment analysis
        sentiment = TextBlob(clean_text).sentiment

        # Print results for each file
        print(f"📄 File: {filename}")
        print(f"🧠 Top Keywords: {[keyword for keyword, count in top_keywords]}")
        print(f"😊 Sentiment (Polarity): {sentiment.polarity}, (Subjectivity): {sentiment.subjectivity}")
        print("-" * 50)

    # Overall frequency analysis
    all_words = [word for text in all_texts for word in text.split() if word not in stop_words]
    overall_frequency = Counter(all_words).most_common(10)

    print("\n🔍 Overall Keyword Frequency Analysis:")
    for word, count in overall_frequency:
        print(f"{word}: {count}")

Text Analysis Results:

📄 File: page_1.png
🧠 Top Keywords: ['yearofourlorp', 'gop', '1762', 'jpissextile', 'leapyear']
😊 Sentiment (Polarity): 0.0, (Subjectivity): 0.0
--------------------------------------------------
📄 File: page_10.png
🧠 Top Keywords: ['7', '9', 'ae', 'le', '6']
😊 Sentiment (Polarity): 0.13333333333333333, (Subjectivity): 0.3833333333333333
--------------------------------------------------
📄 File: page_11.png
🧠 Top Keywords: ['8', 'ae', '2', '5', 'te']
😊 Sentiment (Polarity): 0.078125, (Subjectivity): 0.28229166666666666
--------------------------------------------------
📄 File: page_12.png
🧠 Top Keywords: ['ee', 'te', 'ae', 'ia', '4']
😊 Sentiment (Polarity): 0.057449494949494966, (Subjectivity): 0.46047979797979793
--------------------------------------------------
📄 File: page_13.png
🧠 Top Keywords: ['ae', '8', 'tt', '1', '4']
😊 Sentiment (Polarity): -0.22200000000000003, (Subjectivity): 0.33599999999999997
--------------------------------------------------
📄 Fil

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SONALI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SONALI\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [70]:
#text index and search for pdf
from whoosh.index import create_in, open_dir
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser
import json
import os
import shutil

# Paths
index_dir = "text_indexx"
input_file = "recognized_text_resultss.json"

# Define schema for indexing
schema = Schema(filename=ID(stored=True), content=TEXT)

# Remove existing index directory if present
if os.path.exists(index_dir):
    shutil.rmtree(index_dir)
    print("🗑️ Removed existing index directory.")
os.mkdir(index_dir)

# Create index
ix = create_in(index_dir, schema)

# Load recognized text data
if not os.path.exists(input_file):
    print(f"Error: '{input_file}' not found.")
else:
    with open(input_file, "r", encoding="utf-8") as file:
        recognized_dataa = json.load(file)

    # Index the recognized text
    writer = ix.writer()
    for entry in recognized_dataa:
        filename = entry.get('filename')
        text = entry.get('text', '')
        if text:
            writer.add_document(filename=filename, content=text)
    writer.commit()
    print("✅ Text indexed successfully!")

# Search Functionality
def search_term(term):
    """Search the indexed text for a given term."""
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse(term)
        results = searcher.search(query)
        print(f"\n🔍 Search Results for '{term}':")
        if results:
            for result in results:
                print(f"- Found in file: {result['filename']}")
        else:
            print("⚠️ No results found.")

# Perform a search query
search_query = input("Enter a search term: ")
search_term(search_query)

✅ Text indexed successfully!


Enter a search term:  aa



🔍 Search Results for 'aa':
- Found in file: page_10.png
- Found in file: page_5.png
- Found in file: page_7.png
- Found in file: page_8.png
- Found in file: page_9.png
- Found in file: page_11.png
- Found in file: page_4.png


In [15]:
!pip install sumy

Collecting sumy
  Downloading sumy-0.11.0-py2.py3-none-any.whl.metadata (7.5 kB)
Collecting docopt<0.7,>=0.6.1 (from sumy)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting breadability>=0.1.20 (from sumy)
  Downloading breadability-0.1.20.tar.gz (32 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting pycountry>=18.2.23 (from sumy)
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Downloading sumy-0.11.0-py2.py3-none-any.whl (97 kB)
   ---------------------------------------- 0.0/97.3 kB ? eta -:--:--
   ---------------------------------------- 97.3/97.3 kB 2.8 MB/s eta 0:00:00
Downloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
   ---------------------------------------- 0.0/6.3 MB ? eta -:--:--
   --- ------------------------------------ 0.6/6.3 MB 12.4 MB/s eta 0:00:01
   ------ -------------------

In [19]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SONALI\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\SONALI\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [15]:
!pip uninstall nltk -y
!pip install nltk

Found existing installation: nltk 3.9.1
Uninstalling nltk-3.9.1:
  Successfully uninstalled nltk-3.9.1
Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Installing collected packages: nltk
Successfully installed nltk-3.9.1


In [17]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\SONALI\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [19]:
#summarization
import os
import json
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

# File paths
standardized_text_file = "standardized_text_results.txt"
output_summary_file = "text_data_summary.json"

if not os.path.exists(standardized_text_file):
    print(f"Error: '{standardized_text_file}' not found.")
else:
    # Read standardized text
    with open(standardized_text_file, "r", encoding="utf-8") as file:
        standardized_text = file.read()

    # Use LexRank summarizer
    parser = PlaintextParser.from_string(standardized_text, Tokenizer("english"))
    summarizer = LexRankSummarizer()
    
    # Generate summary with up to 5 sentences
    summary_sentences = summarizer(parser.document, 5)
    
    summary = " ".join(str(sentence) for sentence in summary_sentences)

    # Save summary to a JSON file
    summary_data = {"summary": summary}
    with open(output_summary_file, "w", encoding="utf-8") as f:
        json.dump(summary_data, f, ensure_ascii=False, indent=4)

    print(f"✅ Summary generated and saved to '{output_summary_file}'")

✅ Summary generated and saved to 'text_data_summary.json'


In [23]:
import json
import os

# Path to the summary file
summary_file = "text_data_summary.json"

if not os.path.exists(summary_file):
    print(f"Error: '{summary_file}' not found.")
else:
    # Load and display the summary
    with open(summary_file, "r", encoding="utf-8") as file:
        summary_data = json.load(file)
    
    print("✅ Summary Output:")
    print(summary_data.get("summary", "No summary found."))

✅ Summary Output:
prgrerhenn is apes 7  74 recognized text in pic11jpg by he one 7 ana i of s c prowl lcr molle dis relief 13 oh us right agnes army been 4 in scale steam room he vo to reform ha if wing lift 6 mason its ar a was a week of l dreamy king he of lufleem be hare long to 5 of 7 of a was ire to es c awe flt tom of aealgnad an ay eh l0c times f the feo am co of  long and he le wes of a bbe aes up see one of re rath due tics all an he of on may age len ok a a vein old of see oo i a 2  toe 3 or e i  colon 1 p 2  tip a he lips  te or you me signal feels to a lee ha aagiear of desrki sees hi vera join of by lo cmmrnring of the in ka a is  his levee he 4 vive d 55 he a er ak carry  of see he end low  one we hey lent get p 2 of aran of bed but gaa op ff 4 too see men one gap he cceleh in the new a is e hacepiee on recognized text in pic12jpg by en emnctreal ox of c12 a woolccliit of a tal he go see yon solo bog poets ve of have for new aml is the be heh 4 7 began cacareagel index of

In [49]:
!pip install spacy



In [51]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB 4.1 MB/s eta 0:00:04
     -- ------------------------------------- 0.9/12.8 MB 7.7 MB/s eta 0:00:02
     ---- ----------------------------------- 1.3/12.8 MB 8.2 MB/s eta 0:00:02
     ----- ---------------------------------- 1.7/12.8 MB 8.5 MB/s eta 0:00:02
     ------ --------------------------------- 2.2/12.8 MB 8.6 MB/s eta 0:00:02
     -------- ------------------------------- 2.6/12.8 MB 8.3 MB/s eta 0:00:02
     --------- ------------------------------ 3.0/12.8 MB 8.7 MB/s eta 0:00:02
     ---------- ----------------------------- 3.5/12.8 MB 8.5 MB/s eta 0:00:02
     ----------- ---------------------------- 3.

In [53]:
import spacy
import json
import os

# Path to the summary file
summary_file = "text_data_summary.json"

if not os.path.exists(summary_file):
    print(f"Error: '{summary_file}' not found.")
else:
    # Load and display the summary
    with open(summary_file, "r", encoding="utf-8") as file:
        summary_data = json.load(file)
    
    # Extract the summary text
    summary_text = summary_data.get("summary", "No summary found.")
    print("✅ Summary Output:")
    print(summary_text)

    # Load spaCy English model
    nlp = spacy.load("en_core_web_sm")

    # Process the summary text using spaCy
    doc = nlp(summary_text)

    # Tokenization and POS tagging
    print("\n✅ Tokenization and POS Tagging Output:")
    for token in doc:
        print(f"{token.text}: {token.pos_}")


✅ Summary Output:
prgrerhenn is apes 7  74 recognized text in pic11jpg by he one 7 ana i of s c prowl lcr molle dis relief 13 oh us right agnes army been 4 in scale steam room he vo to reform ha if wing lift 6 mason its ar a was a week of l dreamy king he of lufleem be hare long to 5 of 7 of a was ire to es c awe flt tom of aealgnad an ay eh l0c times f the feo am co of  long and he le wes of a bbe aes up see one of re rath due tics all an he of on may age len ok a a vein old of see oo i a 2  toe 3 or e i  colon 1 p 2  tip a he lips  te or you me signal feels to a lee ha aagiear of desrki sees hi vera join of by lo cmmrnring of the in ka a is  his levee he 4 vive d 55 he a er ak carry  of see he end low  one we hey lent get p 2 of aran of bed but gaa op ff 4 too see men one gap he cceleh in the new a is e hacepiee on recognized text in pic12jpg by en emnctreal ox of c12 a woolccliit of a tal he go see yon solo bog poets ve of have for new aml is the be heh 4 7 began cacareagel index of

In [55]:
!pip install transformers torch



In [57]:
import spacy
import json
import os
from transformers import BertTokenizer, BertModel
import torch

# Path to the summary file
summary_file = "text_data_summary.json"

if not os.path.exists(summary_file):
    print(f"Error: '{summary_file}' not found.")
else:
    # Load and display the summary
    with open(summary_file, "r", encoding="utf-8") as file:
        summary_data = json.load(file)
    
    # Extract the summary text
    summary_text = summary_data.get("summary", "No summary found.")
    print("✅ Summary Output:")
    print(summary_text)

    # Load spaCy English model for tokenization and POS tagging
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(summary_text)

    # Tokenization and POS tagging using spaCy
    print("\n✅ Tokenization and POS Tagging Output:")
    for token in doc:
        print(f"{token.text}: {token.pos_}")
    
    # Step 2: BERT Tokenization and Embedding

    # Load BERT tokenizer and model
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')

    # Tokenize the summary text for BERT
    inputs = tokenizer(summary_text, return_tensors='pt', truncation=True, padding=True)

    # Get BERT embeddings
    with torch.no_grad():  # Disable gradient computation for efficiency
        outputs = model(**inputs)

    # Extract the last hidden states (word embeddings)
    embeddings = outputs.last_hidden_state

    print("\n✅ BERT Embeddings (First token for each word):")
    # Print embeddings for the first token of each word (the output tensor shape is [batch_size, sequence_length, hidden_size])
    for i, token in enumerate(inputs['input_ids'][0]):
        word_token = tokenizer.decode(token)
        embedding = embeddings[0][i].numpy()
        print(f"{word_token}: {embedding[:5]}...")  # Display only the first 5 values for brevity


✅ Summary Output:
prgrerhenn is apes 7  74 recognized text in pic11jpg by he one 7 ana i of s c prowl lcr molle dis relief 13 oh us right agnes army been 4 in scale steam room he vo to reform ha if wing lift 6 mason its ar a was a week of l dreamy king he of lufleem be hare long to 5 of 7 of a was ire to es c awe flt tom of aealgnad an ay eh l0c times f the feo am co of  long and he le wes of a bbe aes up see one of re rath due tics all an he of on may age len ok a a vein old of see oo i a 2  toe 3 or e i  colon 1 p 2  tip a he lips  te or you me signal feels to a lee ha aagiear of desrki sees hi vera join of by lo cmmrnring of the in ka a is  his levee he 4 vive d 55 he a er ak carry  of see he end low  one we hey lent get p 2 of aran of bed but gaa op ff 4 too see men one gap he cceleh in the new a is e hacepiee on recognized text in pic12jpg by en emnctreal ox of c12 a woolccliit of a tal he go see yon solo bog poets ve of have for new aml is the be heh 4 7 began cacareagel index of

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]


✅ BERT Embeddings (First token for each word):
[CLS]: [-0.59055346  0.44584826  0.23336548  0.03874056 -0.3859214 ]...
pr: [-0.8605638  -0.27462593  0.8757959   0.11679095 -0.16488108]...
##gre: [-0.5523482   0.07749548  0.48694786 -0.27878964 -0.19432694]...
##rh: [-0.48949885  0.0519016   0.7445584  -0.04239527  0.12505782]...
##en: [-0.26625496 -0.13402943  0.42478386  0.01067807  0.40560657]...
##n: [-0.8948028  -0.53474563  0.3824216   0.05782855  0.5027791 ]...
is: [-0.49716994 -0.04961524  0.55651665 -0.18209375  0.5097695 ]...
apes: [ 0.11571751 -0.2973941   0.8246481  -0.56004465  0.1050477 ]...
7: [-0.21626994  0.12847307  0.70785356 -0.9996933  -0.3989703 ]...
74: [ 0.11835714  0.60431474  0.9526701  -0.41494328 -0.09073943]...
recognized: [-1.3617524   0.3537998  -0.30848625 -0.02180975 -0.178085  ]...
text: [-0.9110503   0.20749646  0.13154222 -0.10810283  0.24847355]...
in: [-2.1258218  -0.09307418  0.05780092 -0.21680571  0.4345539 ]...
pic: [-1.2818872   0.04610668  0.

In [67]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [71]:
pip install gleu

Collecting gleu
  Downloading gleu-1.1.0-py3-none-any.whl.metadata (439 bytes)
Collecting prettytable (from gleu)
  Downloading prettytable-3.14.0-py3-none-any.whl.metadata (30 kB)
Downloading gleu-1.1.0-py3-none-any.whl (9.1 kB)
Downloading prettytable-3.14.0-py3-none-any.whl (31 kB)
Installing collected packages: prettytable, gleu
Successfully installed gleu-1.1.0 prettytable-3.14.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
from nltk.translate.gleu_score import sentence_gleu

# Reference and candidate sentences
reference = ["this", "is", "a", "test"]
candidate = ["this", "is", "test"]

# Calculate GLEU score
gleu_score_value = sentence_gleu([reference], candidate)
print(f"✅ GLEU Score: {gleu_score_value}")


✅ GLEU Score: 0.4


In [3]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Reference and candidate sentences
reference = [["this", "is", "a", "test"]]
candidate = ["this", "is", "test"]

# Smoothing function to prevent BLEU score from being zero
smoothing = SmoothingFunction().method4  # You can experiment with different methods

# Calculate BLEU score
bleu_score = sentence_bleu(reference, candidate, smoothing_function=smoothing)
print(f"✅ BLEU Score: {bleu_score}")


✅ BLEU Score: 0.16793545727180179


In [7]:
import spacy
import json
import os
from transformers import BertTokenizer, BertModel
import torch
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from nltk.translate.gleu_score import sentence_gleu

# Path to the summary file
summary_file = "text_data_summary.json"

if not os.path.exists(summary_file):
    print(f"Error: '{summary_file}' not found.")
else:
    # Load and display the summary
    with open(summary_file, "r", encoding="utf-8") as file:
        summary_data = json.load(file)
    
    # Extract the summary text
    summary_text = summary_data.get("summary", "No summary found.")
    print("✅ Summary Output:")
    print(summary_text)

    # Load spaCy English model for tokenization and POS tagging
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(summary_text)

    # Tokenization and POS tagging using spaCy
    print("\n✅ Tokenization and POS Tagging Output:")
    for token in doc:
        print(f"{token.text}: {token.pos_}")
    
    # Step 2: BERT Tokenization and Embedding

    # Load BERT tokenizer and model
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')

    # Tokenize the summary text for BERT
    inputs = tokenizer(summary_text, return_tensors='pt', truncation=True, padding=True)

    # Get BERT embeddings
    with torch.no_grad():  # Disable gradient computation for efficiency
        outputs = model(**inputs)

    # Extract the last hidden states (word embeddings)
    embeddings = outputs.last_hidden_state

    print("\n✅ BERT Embeddings (First token for each word):")
    # Print embeddings for the first token of each word (the output tensor shape is [batch_size, sequence_length, hidden_size])
    for i, token in enumerate(inputs['input_ids'][0]):
        word_token = tokenizer.decode(token)
        embedding = embeddings[0][i].numpy()
        print(f"{word_token}: {embedding[:5]}...")  # Display only the first 5 values for brevity
    
    # Step 3: Refining Summary using BLEU Score and GLEU Score

    # Example reference summary (gold standard)
    reference_summary = "This is a sample reference summary for comparison."  # Replace with the actual reference summary

    # BLEU score calculation
    reference = [reference_summary.split()]  # Tokenize the reference
    candidate = summary_text.split()  # Tokenize the generated summary
    
    # BLEU Score calculation (using unigram precision for simplicity)
    smoothing = SmoothingFunction().method4  # You can experiment with different methods
    bleu_score = corpus_bleu([reference], [candidate], smoothing_function=smoothing)
    print(f"\n✅ BLEU Score: {bleu_score}")
    
    # GLEU score calculation (using nltk.translate.gleu_score)
    gleu_score_value = sentence_gleu([reference], candidate)
    print(f"\n✅ GLEU Score: {gleu_score_value}")





✅ Summary Output:
prgrerhenn is apes 7  74 recognized text in pic11jpg by he one 7 ana i of s c prowl lcr molle dis relief 13 oh us right agnes army been 4 in scale steam room he vo to reform ha if wing lift 6 mason its ar a was a week of l dreamy king he of lufleem be hare long to 5 of 7 of a was ire to es c awe flt tom of aealgnad an ay eh l0c times f the feo am co of  long and he le wes of a bbe aes up see one of re rath due tics all an he of on may age len ok a a vein old of see oo i a 2  toe 3 or e i  colon 1 p 2  tip a he lips  te or you me signal feels to a lee ha aagiear of desrki sees hi vera join of by lo cmmrnring of the in ka a is  his levee he 4 vive d 55 he a er ak carry  of see he end low  one we hey lent get p 2 of aran of bed but gaa op ff 4 too see men one gap he cceleh in the new a is e hacepiee on recognized text in pic12jpg by en emnctreal ox of c12 a woolccliit of a tal he go see yon solo bog poets ve of have for new aml is the be heh 4 7 began cacareagel index of

TypeError: unhashable type: 'list'