Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,13 @@ This group includes additional browser management options, such as BrowserBase.
pip install scrapegraphai[more-browser-options]
```

### Installing "Screenshot Scraper"

This group includes an OCR-based screenshot scraper for websites.
```bash
pip install scrapegraphai[screenshot_scraper]
```

## 💻 Usage
There are multiple standard scraping pipelines that can be used to extract information from a website (or local file).

Expand Down
Binary file added examples/extras/Savedscreenshots/test_image.jpeg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
32 changes: 32 additions & 0 deletions examples/extras/screenshot_scaping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""
example of scraping with screenshots
"""
import asyncio
from scrapegraphai.utils.screenshot_scraping import (take_screenshot,
select_area_with_opencv,
crop_image, detect_text)

# STEP 1: Capture the page as a JPEG screenshot.
# take_screenshot() is a coroutine, so asyncio.run() drives it to completion.
screenshot = asyncio.run(
    take_screenshot(
        url="https://colab.google/",
        save_path="Savedscreenshots/test_image.jpeg",
        quality=50,
    )
)

# STEP 2 (Optional): Interactively pick the region that should be scanned for text.
left, top, right, bottom = select_area_with_opencv(screenshot)
print("LEFT: ", left, " TOP: ", top, " RIGHT: ", right, " BOTTOM: ", bottom)

# STEP 3 (Optional): Cut the screenshot down to the chosen region.
# Note: any coordinate passed as None falls back to the matching edge of the image.
cropped = crop_image(
    screenshot,
    LEFT=left,
    TOP=top,
    RIGHT=right,
    BOTTOM=bottom,
)

# STEP 4: Run text detection on the (cropped) image.
detected_text = detect_text(
    cropped,           # The image to detect text from
    languages=["en"],  # The languages to detect text in
)

print("DETECTED TEXT: ")
print(detected_text)
7 changes: 7 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,13 @@ more-browser-options = [
"browserbase>=0.3.0",
]

# Group 4: Screenshot scraping (Surya OCR plus matplotlib/ipywidgets for area selection)
screenshot_scraper = [
"surya-ocr>=0.5.0; python_version >= '3.10'",
"matplotlib>=3.7.2; python_version >= '3.10'",
"ipywidgets>=8.1.0; python_version >= '3.10'"
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
Expand Down
9 changes: 0 additions & 9 deletions requirements-dev.lock
Original file line number Diff line number Diff line change
Expand Up @@ -159,10 +159,6 @@ idna==3.7
# via yarl
imagesize==1.4.1
# via sphinx
importlib-metadata==8.2.0
# via sphinx
importlib-resources==6.4.0
# via matplotlib
iniconfig==2.0.0
# via pytest
isort==5.13.2
Expand Down Expand Up @@ -447,10 +443,8 @@ typing-extensions==4.12.2
# via pydantic
# via pydantic-core
# via pyee
# via pylint
# via sf-hamilton
# via sqlalchemy
# via starlette
# via streamlit
# via typing-inspect
# via uvicorn
Expand All @@ -470,6 +464,3 @@ uvicorn==0.30.5
# via burr
yarl==1.9.4
# via aiohttp
zipp==3.20.0
# via importlib-metadata
# via importlib-resources
2 changes: 2 additions & 0 deletions scrapegraphai/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,6 @@
from .cleanup_html import cleanup_html
from .logging import *
from .convert_to_md import convert_to_md
from .screenshot_scraping.screenshot_preparation import take_screenshot, select_area_with_opencv, select_area_with_ipywidget, crop_image
from .screenshot_scraping.text_detection import detect_text
from .token_calculator import *
2 changes: 2 additions & 0 deletions scrapegraphai/utils/screenshot_scraping/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .screenshot_preparation import take_screenshot, select_area_with_opencv, select_area_with_ipywidget, crop_image
from .text_detection import detect_text
219 changes: 219 additions & 0 deletions scrapegraphai/utils/screenshot_scraping/screenshot_preparation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,219 @@
"""
screenshot_preparation module
"""
import asyncio
from io import BytesIO
from PIL import Image, ImageGrab
from playwright.async_api import async_playwright
import cv2 as cv
import numpy as np
from io import BytesIO

async def take_screenshot(url: str, save_path: str = None, quality: int = 100):
    """
    Takes a full-page JPEG screenshot of the webpage at the specified URL.

    Parameters:
        url (str): The URL of the webpage to take a screenshot of.
        save_path (str): The path to save the screenshot to. It is forwarded to
            Playwright's ``page.screenshot(path=...)``; when None nothing is
            written to disk. Defaults to None.
        quality (int): The quality of the jpeg image, between 1 and 100. Defaults to 100.
    Returns:
        PIL.Image: The screenshot of the webpage as a PIL Image object.
    """

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url)
            image_bytes = await page.screenshot(
                path=save_path,
                type="jpeg",
                full_page=True,
                quality=quality,
            )
        finally:
            # Close the browser even when navigation or the capture itself
            # raises, so a failed call does not leave it running.
            await browser.close()

    # The screenshot bytes are already in memory, so decoding can happen
    # after the Playwright context has been shut down.
    return Image.open(BytesIO(image_bytes))

def select_area_with_opencv(image):
    """
    Allows you to manually select an image area using OpenCV.
    It is recommended to use this function if your project is on your computer,
    otherwise use select_area_with_ipywidget().

    A window titled 'SELECT AREA' is shown; drag with the left mouse button to
    mark a rectangle, then press any key to confirm and close the window.

    Parameters:
        image (PIL.Image): The image from which to select an area.
    Returns:
        tuple: The (LEFT, TOP, RIGHT, BOTTOM) coordinates of the selected area.
        Every element is None when a key is pressed before any area has been
        selected.
    """

    # Only the display height is needed; it is used to size the window below.
    fullscreen_screenshot = ImageGrab.grab()
    _, dh = fullscreen_screenshot.size

    # Selection state lives in this function's scope (shared with the mouse
    # callback through ``nonlocal``) instead of module-level globals, so a
    # call never sees coordinates left over from a previous call and the
    # final ``return`` cannot fail on an undefined name.
    drawing = False
    ix, iy = -1, -1
    LEFT = TOP = RIGHT = BOTTOM = None

    img = np.array(image)
    img = cv.cvtColor(img, cv.COLOR_RGB2BGR)

    # Red frame and caption drawn on the picture as a usage hint.
    img = cv.rectangle(
        img, (0, 0), (image.size[0], image.size[1]), (0, 0, 255), 10)
    img = cv.putText(
        img, 'SELECT AN AREA', (int(image.size[0]*0.3), 100),
        cv.FONT_HERSHEY_SIMPLEX, 2, (0, 0, 255), 5)

    # Untouched copy that is blended back in while the rectangle is dragged.
    overlay = img.copy()
    alpha = 0.3

    def draw_selection_rectangle(event, x, y, flags, param):
        """Mouse callback: track the drag and record the final rectangle."""
        nonlocal drawing, ix, iy, img, LEFT, TOP, RIGHT, BOTTOM

        if event == cv.EVENT_LBUTTONDOWN:
            # Remember the anchor corner of the rectangle.
            drawing = True
            ix, iy = x, y
        elif event == cv.EVENT_MOUSEMOVE:
            if drawing:
                cv.rectangle(img, (ix, iy), (x, y), (41, 215, 162), -1)
                cv.putText(
                    img, 'PRESS ANY KEY TO SELECT THIS AREA', (ix, iy-10),
                    cv.FONT_HERSHEY_SIMPLEX, 1.5, (55, 46, 252), 5)
                # Blend with the clean copy so the filled rectangle stays
                # see-through.
                img = cv.addWeighted(overlay, alpha, img, 1 - alpha, 0)
        elif event == cv.EVENT_LBUTTONUP:
            drawing = False
            # Normalise so LEFT <= RIGHT and TOP <= BOTTOM whatever the
            # direction of the drag.
            LEFT, RIGHT = sorted((int(ix), int(x)))
            TOP, BOTTOM = sorted((int(iy), int(y)))

    while True:
        # Window setup is repeated on every pass, as in the original code.
        # NOTE(review): presumably this restores the window and its callback
        # if the user closes it - confirm before hoisting out of the loop.
        cv.namedWindow('SELECT AREA', cv.WINDOW_KEEPRATIO)
        cv.setMouseCallback('SELECT AREA', draw_selection_rectangle)
        # Scale the window to the display height, keeping the aspect ratio.
        cv.resizeWindow('SELECT AREA', int(
            image.size[0]/(image.size[1]/dh)), dh)

        cv.imshow('SELECT AREA', img)

        # Any key press confirms the current selection.
        if cv.waitKey(20) > -1:
            break

    cv.destroyAllWindows()
    return LEFT, TOP, RIGHT, BOTTOM


def select_area_with_ipywidget(image):
    """
    Allows you to manually select an image area using ipywidgets.
    It is recommended to use this function if your project is in Google Colab,
    Kaggle or other similar platform, otherwise use select_area_with_opencv().

    Parameters:
        image (PIL Image): The input image.
    Returns:
        tuple: ``(left_right_slider, top_bottom_slider)``, the two
        ``IntRangeSlider`` widgets driving the preview. Read their ``.value``
        (a ``(low, high)`` pair) once the guide lines are where you want them.
    Notes:
        ``left_right_slider.value`` holds the LEFT and RIGHT x-coordinates
        directly. ``top_bottom_slider.value`` is measured from the bottom edge
        of the image: the guide lines are drawn at ``height - value``, so
        ``TOP = height - value[1]`` and ``BOTTOM = height - value[0]``.
    """

    # Imported here rather than at module level, so they are only required
    # when this function is actually called.
    import matplotlib.pyplot as plt
    import numpy as np
    from ipywidgets import interact
    import ipywidgets as widgets

    img_array = np.array(image)

    # Print the array shape; its first two entries are the slider maxima below.
    print(img_array.shape)

    def update_plot(top_bottom, left_right, image_size):
        """Redraw the image with the four guide lines for the current values."""
        plt.figure(figsize=(image_size, image_size))
        plt.imshow(img_array)

        # Vertical guides: slider values are x-coordinates.
        plt.axvline(x=left_right[0], color='blue', linewidth=1)
        plt.text(left_right[0]+1, -25, 'LEFT', rotation=90, color='blue')
        plt.axvline(x=left_right[1], color='red', linewidth=1)
        plt.text(left_right[1]+1, -25, 'RIGHT', rotation=90, color='red')

        # Horizontal guides: slider values are distances from the bottom
        # edge, so they are flipped into image rows.
        plt.axhline(y=img_array.shape[0] - top_bottom[0],
                    color='green', linewidth=1)
        plt.text(-100, img_array.shape[0] - top_bottom[0]+1,
                 'BOTTOM', color='green')
        plt.axhline(y=img_array.shape[0]-top_bottom[1],
                    color='darkorange', linewidth=1)
        plt.text(-100, img_array.shape[0] - top_bottom[1]+1,
                 'TOP', color='darkorange')
        plt.axis('off')
        plt.show()

    # Both range sliders start on the middle half of their axis.
    top_bottom_slider = widgets.IntRangeSlider(
        value=[int(img_array.shape[0]*0.25), int(img_array.shape[0]*0.75)],
        min=0,
        max=img_array.shape[0],
        step=1,
        description='top_bottom:',
        disabled=False,
        continuous_update=True,
        orientation='vertical',
        readout=True,
        readout_format='d',
    )

    left_right_slider = widgets.IntRangeSlider(
        value=[int(img_array.shape[1]*0.25), int(img_array.shape[1]*0.75)],
        min=0,
        max=img_array.shape[1],
        step=1,
        description='left_right:',
        disabled=False,
        continuous_update=True,
        orientation='horizontal',
        readout=True,
        readout_format='d',
    )

    # Passed to update_plot() as the matplotlib figure size.
    image_size_bt = widgets.BoundedIntText(
        value=10,
        min=2,
        max=20,
        step=1,
        description='Image size:',
        disabled=False
    )

    interact(update_plot, top_bottom=top_bottom_slider,
             left_right=left_right_slider, image_size=image_size_bt)

    return left_right_slider, top_bottom_slider


def crop_image(image, LEFT=None, TOP=None, RIGHT=None, BOTTOM=None, save_path: str = None):
    """
    Crop an image using the specified coordinates.

    Parameters:
        image (PIL.Image): The image to be cropped.
        LEFT (int, optional): The x-coordinate of the left edge of the crop area. Defaults to None.
        TOP (int, optional): The y-coordinate of the top edge of the crop area. Defaults to None.
        RIGHT (int, optional): The x-coordinate of the right edge of the crop area. Defaults to None.
        BOTTOM (int, optional): The y-coordinate of the bottom edge of the crop area. Defaults to None.
        save_path (str, optional): The path to save the cropped image. Defaults to None.
    Returns:
        PIL.Image: The cropped image.
    Notes:
        If any of the coordinates (LEFT, TOP, RIGHT, BOTTOM) is None,
        it will be set to the corresponding edge of the image.
        If save_path is specified, the cropped image will be saved as a JPEG file at the specified path.
    """

    # Missing coordinates fall back to the matching edge of the image.
    if LEFT is None:
        LEFT = 0
    if TOP is None:
        TOP = 0
    if RIGHT is None:
        RIGHT = image.size[0]
    if BOTTOM is None:
        BOTTOM = image.size[1]

    # Crop exactly once; the same object is saved (if requested) and returned.
    cropped_image = image.crop((LEFT, TOP, RIGHT, BOTTOM))
    if save_path is not None:
        cropped_image.save(save_path, "JPEG")

    return cropped_image
29 changes: 29 additions & 0 deletions scrapegraphai/utils/screenshot_scraping/text_detection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""
text_detection_module
"""
from surya.ocr import run_ocr
from surya.model.detection.model import (load_model as load_det_model,
load_processor as load_det_processor)
from surya.model.recognition.model import load_model as load_rec_model
from surya.model.recognition.processor import load_processor as load_rec_processor


def detect_text(image, languages: list = None):
    """
    Detects and extracts text from a given image.

    Parameters:
        image (PIL Image): The input image to extract text from.
        languages (list): A list of languages to detect text in. Defaults to ["en"]. List of languages can be found here: https://github.com/VikParuchuri/surya/blob/master/surya/languages.py
    Returns:
        str: The extracted text from the image, one detected text line per
        line of the returned string.
    Notes:
        Model weights will automatically download the first time you run this function.
    """

    # Build the default per call instead of sharing one mutable list object
    # between every invocation.
    langs = ["en"] if languages is None else languages

    # The detection and recognition models and processors are loaded on every call.
    det_processor, det_model = load_det_processor(), load_det_model()
    rec_model, rec_processor = load_rec_model(), load_rec_processor()

    # run_ocr() takes a batch: one image with its matching language list.
    predictions = run_ocr([image], [langs], det_model,
                          det_processor, rec_model, rec_processor)

    # Only one image was submitted, so only the first prediction is relevant.
    text = "\n".join(line.text for line in predictions[0].text_lines)
    return text