In [8]:
import pandas as pd

In [9]:
# Example of reading the dataset from a CSV file
data = pd.read_csv('sample_data/images_train.csv')

# Preview the data to ensure it has been loaded correctly
data.head()

Unnamed: 0,image_link,group_id,entity_name,entity_value
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,item_weight,500.0 gram
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,item_volume,1.0 cup
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,item_weight,0.709 gram
3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,item_weight,0.709 gram
4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,item_weight,1400 milligram


In [10]:
data.shape

(263859, 4)

In [1]:
# Install PaddlePaddle (with GPU support)
!pip install paddlepaddle-gpu==2.5.0.post117 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html

# Install PaddleOCR
!pip install paddleocr

# Install additional required libraries
!pip install pillow requests

Looking in links: https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
Collecting paddlepaddle-gpu==2.5.0.post117
  Downloading https://paddle-wheel.bj.bcebos.com/2.5.0/linux/linux-gpu-cuda11.7-cudnn8.4.1-mkl-gcc8.2-avx/paddlepaddle_gpu-2.5.0.post117-cp310-cp310-linux_x86_64.whl (546.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m546.7/546.7 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx (from paddlepaddle-gpu==2.5.0.post117)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting astor (from paddlepaddle-gpu==2.5.0.post117)
  Downloading astor-0.8.1-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting paddle-bfloat==0.1.7 (from paddlepaddle-gpu==2.5.0.post117)
  Downloading paddle_bfloat-0.1.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (229 bytes)
Collecting httpcore==1.* (from httpx->paddlepaddle-gpu==2.5.0.post117)
  Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)
Collecting h11

In [2]:
from paddleocr import PaddleOCR
import requests
from PIL import Image
from io import BytesIO
import numpy as np

In [3]:
# Initialize PaddleOCR once
ocr = PaddleOCR(use_angle_cls=True, lang='en')  # Set language to English

download https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar to /root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer/en_PP-OCRv3_det_infer.tar


100%|██████████| 4.00M/4.00M [00:15<00:00, 263kiB/s] 


download https://paddleocr.bj.bcebos.com/PP-OCRv4/english/en_PP-OCRv4_rec_infer.tar to /root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/en_PP-OCRv4_rec_infer.tar


100%|██████████| 10.2M/10.2M [00:18<00:00, 561kiB/s] 


download https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar to /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.tar


100%|██████████| 2.19M/2.19M [00:14<00:00, 147kiB/s]

[2024/09/21 06:00:40] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=True, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25, rec_ch




In [12]:
def extract_text_from_image(image_url):
    """
    Extracts text from an image using PaddleOCR with error handling.

    Args:
        image_url (str): URL of the image to perform OCR on.

    Returns:
        str: Full extracted text from the image, or an error message.
    """
    try:
        # Load the image from the URL
        response = requests.get(image_url)
        response.raise_for_status()  # Check if the request was successful
        img = Image.open(BytesIO(response.content))

        # Convert the image to a numpy array
        img_np = np.array(img)

        # Perform OCR on the image
        result = ocr.ocr(img_np)

        # Check if result is empty or None
        if result and result[0]:
            # Extract text from the result
            extracted_text = ' '.join([line[-1][0] for line in result[0]])
            return extracted_text
        else:
            return "No text found in image."

    except requests.exceptions.RequestException as e:
        return f"Error loading image: {str(e)}"
    except TypeError as e:
        return f"Type error during processing: {str(e)}"
    except Exception as e:
        return f"An error occurred: {str(e)}"

In [None]:
text = extract_text_from_image("https://m.media-amazon.com/images/I/81xsq6vf2qL.jpg")
print(text)

[2024/09/21 05:41:23] ppocr DEBUG: dt_boxes num : 37, elapsed : 0.0653538703918457
[2024/09/21 05:41:24] ppocr DEBUG: cls num  : 37, elapsed : 0.046353816986083984
[2024/09/21 05:41:24] ppocr DEBUG: rec_res num  : 37, elapsed : 0.21457219123840332
Horbaach Directions: For adults, take two (2) vegan capsules daily, preferably with a meal. Do not exceed stated dose. Nutrition Information Typically Per Daily Dose HIGHSTRENGTH 1400mg Psyllium Husk Powder PSYLLIOM Ingredients: Psyllium Husk Powder, Capsule Shell (Hy-  droxypropylmethylcellulose), Anti-Caking Agents (Mag.  nesium Salts of Fatty Acids, Silicon Dioxide). HUSK May contain Sesame Seeds & Mustard. For allergens see the ingredients in bold. Notice: Take this product with 220ml of fluids. Taking this product without adequate fluid may cause the pos- 1400MG sibility of choking. Do not use this product if you have PLANTAGO OVATA difficulty swallowing. If you experience chest pain, vom- PLANT SEEDS iting or difficulty in swallowing or

In [None]:
text = extract_text_from_image("https://m.media-amazon.com/images/I/81N73b5khVL.jpg")
print(text)

[2024/09/21 05:41:34] ppocr DEBUG: dt_boxes num : 7, elapsed : 0.04267096519470215
[2024/09/21 05:41:34] ppocr DEBUG: cls num  : 7, elapsed : 0.20307064056396484
[2024/09/21 05:41:34] ppocr DEBUG: rec_res num  : 7, elapsed : 0.08803772926330566
Groe Kapazitat, Tragfahigkeit bis zu 30KG. Das Material ist ca.5mm dick und die Fugen sind verstarkt, so dass es sehr stark ist Tragfahigkeit bis zu 30KG, kann eine Vielzahl von Gegenstanden 30 KG aufnehmen.


In [6]:
print(extract_text_from_image("https://m.media-amazon.com/images/I/817vo3DcCNL.jpg"))

[2024/09/21 06:01:46] ppocr DEBUG: dt_boxes num : 12, elapsed : 0.07841634750366211
[2024/09/21 06:01:46] ppocr DEBUG: cls num  : 12, elapsed : 0.10532736778259277
[2024/09/21 06:01:46] ppocr DEBUG: rec_res num  : 12, elapsed : 0.16677308082580566
KOMFORT-PAKET DAS HERZSTUCK : STARKER UND EFFIZIENTER MOTOR FUR ANGENEHMEN FAHRKOMFORT KRAFTVOLLE UND EFFIZIENTE EXTREM LEISERUND UNGLAUBLICHES DREHMOMENT UNTERSTUTZUNG BIS LEISTUNGSSTARKERMOTOR MIT VON 25 KM/H 250 W 45 NM


In [5]:
text = extract_text_from_image("https://m.media-amazon.com/images/I/81e2YtCOKvL.jpg")
text

[2024/09/21 06:01:27] ppocr DEBUG: dt_boxes num : 39, elapsed : 1.4923243522644043
[2024/09/21 06:01:27] ppocr DEBUG: cls num  : 39, elapsed : 0.15944933891296387
[2024/09/21 06:01:27] ppocr DEBUG: rec_res num  : 39, elapsed : 0.23135137557983398


"FREE Glucon-D Regular 200gpack Glucon-D Net Weight Instant Energy Ikg avetray hanyfGnD 99.4% pure Glucose-Glucose is an Instant energy source for the body and is the only energy source for the brain. Vitamin Glucon-D has Vitamin D and Calcium that provides strength to the bones and body. energy production and storage. So experience the Instant Energy'of Glucon-D with your family!    D i  n%6   facifta 3 ffer ast.tffaia Offer available in specifically marked packs only We would love to hear from you MFD. M.R.PRs./(INCL.OFALLTAXES) OTNo (See space below /bottom of jar.)"

In [7]:
text = extract_text_from_image("https://m.media-amazon.com/images/I/915JHkwtcrL.jpg")
print(text)

[2024/09/21 06:02:12] ppocr DEBUG: dt_boxes num : 9, elapsed : 0.06304693222045898
[2024/09/21 06:02:13] ppocr DEBUG: cls num  : 9, elapsed : 0.11662483215332031
[2024/09/21 06:02:13] ppocr DEBUG: rec_res num  : 9, elapsed : 0.03548574447631836
51*70 IN 31 0Z CARDEN soft ventilate warm and Particularly smooth Comfortable to skin


In [13]:
import pandas as pd

# Global counter
counter = 0

def extract_text_with_counter(image_url):
    global counter
    counter += 1  # Increment the counter for each row processed
    print(f"Processing image {counter}...")  # Print progress

    # Call the original extract_text_from_image function
    return extract_text_from_image(image_url)

# Load dataset
data = pd.read_csv('sample_data/images_train.csv')

# Limit the data to the first 10,000 samples
data_subset = data.head(10000)

# Apply the function on the image links (only on the first 10,000)
data_subset['extracted_text'] = data_subset['image_link'].apply(extract_text_with_counter)

# Show the result of the first few samples
print(data_subset.head())

# Save the result to a CSV file (optional)
data_subset.to_csv('output_with_extracted_text_10000.csv', index=False)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[2024/09/21 06:52:26] ppocr DEBUG: dt_boxes num : 6, elapsed : 0.038574934005737305
[2024/09/21 06:52:26] ppocr DEBUG: cls num  : 6, elapsed : 0.008194446563720703
[2024/09/21 06:52:26] ppocr DEBUG: rec_res num  : 6, elapsed : 0.025491714477539062
Processing image 8755...
[2024/09/21 06:52:27] ppocr DEBUG: dt_boxes num : 34, elapsed : 0.051104068756103516
[2024/09/21 06:52:27] ppocr DEBUG: cls num  : 34, elapsed : 0.03873586654663086
[2024/09/21 06:52:27] ppocr DEBUG: rec_res num  : 34, elapsed : 0.07476353645324707
Processing image 8756...
[2024/09/21 06:52:27] ppocr DEBUG: dt_boxes num : 9, elapsed : 0.029519319534301758
[2024/09/21 06:52:27] ppocr DEBUG: cls num  : 9, elapsed : 0.015623331069946289
[2024/09/21 06:52:27] ppocr DEBUG: rec_res num  : 9, elapsed : 0.05468487739562988
Processing image 8757...
[2024/09/21 06:52:27] ppocr DEBUG: dt_boxes num : 82, elapsed : 0.05475425720214844
[2024/09/21 06:52:27] ppocr DEBU