diff --git a/README.md b/README.md index 2174bf47..d9058936 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,13 @@ This group includes additional browser management options, such as BrowserBase. pip install scrapegraphai[more-browser-options] ``` +### Installing "Screenshot Scraper" + +This group includes an OCR-based scraper that extracts text from website screenshots. +```bash +pip install scrapegraphai[screenshot_scraper] +``` + ## 💻 Usage There are multiple standard scraping pipelines that can be used to extract information from a website (or local file). diff --git a/examples/extras/Savedscreenshots/test_image.jpeg b/examples/extras/Savedscreenshots/test_image.jpeg new file mode 100644 index 00000000..159625bc Binary files /dev/null and b/examples/extras/Savedscreenshots/test_image.jpeg differ diff --git a/examples/extras/screenshot_scaping.py b/examples/extras/screenshot_scaping.py new file mode 100644 index 00000000..439c2a0c --- /dev/null +++ b/examples/extras/screenshot_scaping.py @@ -0,0 +1,32 @@ +""" +example of scraping with screenshots +""" +import asyncio +from scrapegraphai.utils.screenshot_scraping import (take_screenshot, + select_area_with_opencv, + crop_image, detect_text) + +# STEP 1: Take a screenshot +image = asyncio.run(take_screenshot( + url="https://colab.google/", + save_path="Savedscreenshots/test_image.jpeg", + quality = 50 +)) + +# STEP 2 (Optional): Select an area of the image which you want to use for text detection. +LEFT, TOP, RIGHT, BOTTOM = select_area_with_opencv(image) +print("LEFT: ", LEFT, " TOP: ", TOP, " RIGHT: ", RIGHT, " BOTTOM: ", BOTTOM) + +# STEP 3 (Optional): Crop the image. +# Note: If any of the coordinates (LEFT, TOP, RIGHT, BOTTOM) is None, +# it will be set to the corresponding edge of the image. 
+cropped_image = crop_image(image, LEFT=LEFT, RIGHT=RIGHT,TOP=TOP,BOTTOM=BOTTOM) + +# STEP 4: Detect text +TEXT = detect_text( + cropped_image, # The image to detect text from + languages = ["en"] # The languages to detect text in +) + +print("DETECTED TEXT: ") +print(TEXT) diff --git a/pyproject.toml b/pyproject.toml index 47720cbf..8c321d1a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -89,6 +89,13 @@ more-browser-options = [ "browserbase>=0.3.0", ] +# Group 4: Surya Library +screenshot_scraper = [ + "surya-ocr>=0.5.0; python_version >= '3.10'", + "matplotlib>=3.7.2; python_version >= '3.10'", + "ipywidgets>=8.1.0; python_version >= '3.10'" +] + [build-system] requires = ["hatchling"] build-backend = "hatchling.build" diff --git a/requirements-dev.lock b/requirements-dev.lock index b816db3d..3fc9c8f9 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -159,10 +159,6 @@ idna==3.7 # via yarl imagesize==1.4.1 # via sphinx -importlib-metadata==8.2.0 - # via sphinx -importlib-resources==6.4.0 - # via matplotlib iniconfig==2.0.0 # via pytest isort==5.13.2 @@ -447,10 +443,8 @@ typing-extensions==4.12.2 # via pydantic # via pydantic-core # via pyee - # via pylint # via sf-hamilton # via sqlalchemy - # via starlette # via streamlit # via typing-inspect # via uvicorn @@ -470,6 +464,3 @@ uvicorn==0.30.5 # via burr yarl==1.9.4 # via aiohttp -zipp==3.20.0 - # via importlib-metadata - # via importlib-resources diff --git a/scrapegraphai/utils/__init__.py b/scrapegraphai/utils/__init__.py index 0a1b8d5b..86fa0840 100644 --- a/scrapegraphai/utils/__init__.py +++ b/scrapegraphai/utils/__init__.py @@ -11,4 +11,6 @@ from .cleanup_html import cleanup_html from .logging import * from .convert_to_md import convert_to_md +from .screenshot_scraping.screenshot_preparation import take_screenshot, select_area_with_opencv, select_area_with_ipywidget, crop_image +from .screenshot_scraping.text_detection import detect_text from .token_calculator import * diff --git 
a/scrapegraphai/utils/screenshot_scraping/__init__.py b/scrapegraphai/utils/screenshot_scraping/__init__.py new file mode 100644 index 00000000..20cfb3c0 --- /dev/null +++ b/scrapegraphai/utils/screenshot_scraping/__init__.py @@ -0,0 +1,2 @@ +from .screenshot_preparation import take_screenshot, select_area_with_opencv, select_area_with_ipywidget, crop_image +from .text_detection import detect_text diff --git a/scrapegraphai/utils/screenshot_scraping/screenshot_preparation.py b/scrapegraphai/utils/screenshot_scraping/screenshot_preparation.py new file mode 100644 index 00000000..6205449c --- /dev/null +++ b/scrapegraphai/utils/screenshot_scraping/screenshot_preparation.py @@ -0,0 +1,219 @@ +""" +screenshot_preparation module +""" +import asyncio +from io import BytesIO +from PIL import Image, ImageGrab +from playwright.async_api import async_playwright +import cv2 as cv +import numpy as np +from io import BytesIO + +async def take_screenshot(url: str, save_path: str = None, quality: int = 100): + """ + Takes a screenshot of a webpage at the specified URL and saves it if the save_path is specified. + Parameters: + url (str): The URL of the webpage to take a screenshot of. + save_path (str): The path to save the screenshot to. Defaults to None. + quality (int): The quality of the jpeg image, between 1 and 100. Defaults to 100. + Returns: + PIL.Image: The screenshot of the webpage as a PIL Image object. + """ + + async with async_playwright() as p: + browser = await p.chromium.launch(headless=True) + page = await browser.new_page() + await page.goto(url) + image_bytes = await page.screenshot(path=save_path, + type="jpeg", + full_page=True, + quality=quality) + await browser.close() + return Image.open(BytesIO(image_bytes)) + +def select_area_with_opencv(image): + """ + Allows you to manually select an image area using OpenCV. + It is recommended to use this function if your project is on your computer, + otherwise use select_area_with_ipywidget(). 
+ Parameters: + image (PIL.Image): The image from which to select an area. + Returns: + A tuple containing the LEFT, TOP, RIGHT, and BOTTOM coordinates of the selected area. + """ + + fullscreen_screenshot = ImageGrab.grab() + dw, dh = fullscreen_screenshot.size + + def draw_selection_rectanlge(event, x, y, flags, param): + global ix, iy, drawing, overlay, img + if event == cv.EVENT_LBUTTONDOWN: + drawing = True + ix, iy = x, y + elif event == cv.EVENT_MOUSEMOVE: + if drawing == True: + cv.rectangle(img, (ix, iy), (x, y), (41, 215, 162), -1) + cv.putText(img, 'PRESS ANY KEY TO SELECT THIS AREA', (ix, + iy-10), cv.FONT_HERSHEY_SIMPLEX, 1.5, (55, 46, 252), 5) + img = cv.addWeighted(overlay, alpha, img, 1 - alpha, 0) + elif event == cv.EVENT_LBUTTONUP: + global LEFT, TOP, RIGHT, BOTTOM + + drawing = False + if ix < x: + LEFT = int(ix) + RIGHT = int(x) + else: + LEFT = int(x) + RIGHT = int(ix) + if iy < y: + TOP = int(iy) + BOTTOM = int(y) + else: + TOP = int(y) + BOTTOM = int(iy) + + global drawing, ix, iy, overlay, img + drawing = False + ix, iy = -1, -1 + + img = np.array(image) + img = cv.cvtColor(img, cv.COLOR_RGB2BGR) + + img = cv.rectangle( + img, (0, 0), (image.size[0], image.size[1]), (0, 0, 255), 10) + img = cv.putText(img, 'SELECT AN AREA', (int( + image.size[0]*0.3), 100), cv.FONT_HERSHEY_SIMPLEX, 2, (0, 0, 255), 5) + + overlay = img.copy() + alpha = 0.3 + + while True: + cv.namedWindow('SELECT AREA', cv.WINDOW_KEEPRATIO) + cv.setMouseCallback('SELECT AREA', draw_selection_rectanlge) + cv.resizeWindow('SELECT AREA', int( + image.size[0]/(image.size[1]/dh)), dh) + + cv.imshow('SELECT AREA', img) + + if cv.waitKey(20) > -1: + break + + cv.destroyAllWindows() + return LEFT, TOP, RIGHT, BOTTOM + + +def select_area_with_ipywidget(image): + """ + Allows you to manually select an image area using ipywidgets. + It is recommended to use this function if your project is in Google Colab, + Kaggle or other similar platform, otherwise use select_area_with_opencv(). 
+ Parameters: + image (PIL Image): The input image. + Returns: + tuple: The (left_right_slider, top_bottom_slider) IntRangeSlider widgets holding the selected area. + """ + + import matplotlib.pyplot as plt + import numpy as np + from ipywidgets import interact, IntSlider + import ipywidgets as widgets + from PIL import Image + + img_array = np.array(image) + + print(img_array.shape) + + def update_plot(top_bottom, left_right, image_size): + plt.figure(figsize=(image_size, image_size)) + plt.imshow(img_array) + plt.axvline(x=left_right[0], color='blue', linewidth=1) + plt.text(left_right[0]+1, -25, 'LEFT', rotation=90, color='blue') + plt.axvline(x=left_right[1], color='red', linewidth=1) + plt.text(left_right[1]+1, -25, 'RIGHT', rotation=90, color='red') + + plt.axhline(y=img_array.shape[0] - + top_bottom[0], color='green', linewidth=1) + plt.text(-100, img_array.shape[0] - + top_bottom[0]+1, 'BOTTOM', color='green') + plt.axhline(y=img_array.shape[0]-top_bottom[1], + color='darkorange', linewidth=1) + plt.text(-100, img_array.shape[0] - + top_bottom[1]+1, 'TOP', color='darkorange') + plt.axis('off') + plt.show() + + top_bottom_slider = widgets.IntRangeSlider( + value=[int(img_array.shape[0]*0.25), int(img_array.shape[0]*0.75)], + min=0, + max=img_array.shape[0], + step=1, + description='top_bottom:', + disabled=False, + continuous_update=True, + orientation='vertical', + readout=True, + readout_format='d', + ) + + left_right_slider = widgets.IntRangeSlider( + value=[int(img_array.shape[1]*0.25), int(img_array.shape[1]*0.75)], + min=0, + max=img_array.shape[1], + step=1, + description='left_right:', + disabled=False, + continuous_update=True, + orientation='horizontal', + readout=True, + readout_format='d', + ) + image_size_bt = widgets.BoundedIntText( + value=10, + min=2, + max=20, + step=1, + description='Image size:', + disabled=False + ) + + interact(update_plot, top_bottom=top_bottom_slider, + left_right=left_right_slider, image_size=image_size_bt) + + return left_right_slider, top_bottom_slider + + +def crop_image(image, LEFT=None, TOP=None, 
RIGHT=None, BOTTOM=None, save_path: str = None): + """ + Crop an image using the specified coordinates. + Parameters: + image (PIL.Image): The image to be cropped. + LEFT (int, optional): The x-coordinate of the left edge of the crop area. Defaults to None. + TOP (int, optional): The y-coordinate of the top edge of the crop area. Defaults to None. + RIGHT (int, optional): The x-coordinate of + the right edge of the crop area. Defaults to None. + BOTTOM (int, optional): The y-coordinate of the bottom edge of the crop area. Defaults to None. + save_path (str, optional): The path to save the cropped image. Defaults to None. + Returns: + PIL.Image: The cropped image. + Notes: + If any of the coordinates (LEFT, TOP, RIGHT, BOTTOM) is None, + it will be set to the corresponding edge of the image. + If save_path is specified, the cropped image will be saved as a JPEG file at the specified path. + """ + + if LEFT is None: + LEFT = 0 + if TOP is None: + TOP = 0 + if RIGHT is None: + RIGHT = image.size[0] + if BOTTOM is None: + BOTTOM = image.size[1] + + croped_image = image.crop((LEFT, TOP, RIGHT, BOTTOM)) + if save_path is not None: + from pathlib import Path + croped_image.save(save_path, "JPEG") + + return image.crop((LEFT, TOP, RIGHT, BOTTOM)) diff --git a/scrapegraphai/utils/screenshot_scraping/text_detection.py b/scrapegraphai/utils/screenshot_scraping/text_detection.py new file mode 100644 index 00000000..8c33671f --- /dev/null +++ b/scrapegraphai/utils/screenshot_scraping/text_detection.py @@ -0,0 +1,29 @@ +""" +text_detection_module +""" +from surya.ocr import run_ocr +from surya.model.detection.model import (load_model as load_det_model, + load_processor as load_det_processor) +from surya.model.recognition.model import load_model as load_rec_model +from surya.model.recognition.processor import load_processor as load_rec_processor + + +def detect_text(image, languages: list = ["en"]): + """ + Detects and extracts text from a given image. 
+ Parameters: + image (PIL Image): The input image to extract text from. + languages (list): A list of languages to detect text in. Defaults to ["en"]. List of languages can be found here: https://github.com/VikParuchuri/surya/blob/master/surya/languages.py + Returns: + str: The extracted text from the image. + Notes: + Model weights will automatically download the first time you run this function. + """ + + langs = languages + det_processor, det_model = load_det_processor(), load_det_model() + rec_model, rec_processor = load_rec_model(), load_rec_processor() + predictions = run_ocr([image], [langs], det_model, + det_processor, rec_model, rec_processor) + text = "\n".join([line.text for line in predictions[0].text_lines]) + return text \ No newline at end of file