In [None]:
import pandas as pd
import re

# Load the CSV file
data = pd.read_csv("../../../datasets/train.csv")

# Filter to only include item_weight entity
item_weight_data = data[data['entity_name'] == 'height']

# Extract numeric value and unit from entity_value
def split_value(value):
    value = value.strip()
    parts = value.strip().split(' ')
    if len(parts) == 2:
        try: 
            float(parts[0])
        except:
            return None, None
        return float(parts[0]), parts[1]
    return None, None

item_weight_data[['numeric_value', 'unit']] = item_weight_data['entity_value'].apply(split_value).apply(pd.Series)

# Drop rows with missing values
item_weight_data.dropna(inplace=True)

# Save processed data to a new CSV file
item_weight_data.to_csv('updates.csv', index=True)

print("Data preprocessing complete. Processed data saved to 'processed_item_weight_data.csv'.")

In [2]:
from utils_height import download_images
download_images(item_weight_data['image_link'].tolist(),  item_weight_data.index.tolist(), "dataset")

100%|██████████| 43597/43597 [00:01<00:00, 32549.49it/s]


In [7]:
import os
import logging
import pandas as pd
from paddleocr import PaddleOCR
import easyocr
from tqdm import tqdm

# logging.getLogger('ppocr').setLevel(logging.ERROR)
logging.getLogger('easyocr').setLevel(logging.ERROR)

image_folder = 'dataset/'  # Adjust this to the correct folder containing your images
# ocr = PaddleOCR(use_angle_cls=True, lang='en')  # Initialize PaddleOCR
reader = easyocr.Reader(['en'], gpu=True)  # Initialize EasyOCR

def extract_ocr_text(index):
    image_path = os.path.join(image_folder, f"{index}.jpg")
    if not os.path.exists(image_path):
        return index, ""
    
    try:
        result = reader.readtext(image_path)
        ocr_text = " ".join([text for _, text, _ in result])
    except Exception as e:
        print(f"Error processing image {index}: {e}")
        ocr_text = ""
    
    return index, ocr_text

In [8]:
# Assuming `data` is your DataFrame
data = pd.read_csv("updates.csv")
indices = data['id'].to_list()
# Process images in smaller batches to avoid memory issues
batch_size = 10  # Adjust this based on your memory constraints
ocr_results = []

for i in tqdm(range(0, len(indices), batch_size), desc="Extracting OCR"):
    batch_indices = indices[i:i + batch_size]
    batch_indices = batch_indices[::2]
    for index in batch_indices:
        batch_results = []
        batch_results.append(extract_ocr_text(index))
        ocr_results.extend(batch_results)
    

# Add the OCR results to the DataFrame
ocr_text_dict = dict(ocr_results)
data['ocr_text'] = data['id'].map(ocr_text_dict)
data = data[data['ocr_text'].notna() & (data['ocr_text'] != "")]
# Save the updated DataFrame with the OCR text to a new CSV
updated_csv_file = 'height_with_ocr.csv'
data.to_csv(updated_csv_file, index=True)

print(f"OCR data has been added to the CSV. Updated file saved as '{updated_csv_file}'.")

Extracting OCR:  18%|█▊        | 771/4360 [16:32<1:05:57,  1.10s/it]Premature end of JPEG file


Error processing image 227978: image file is truncated (4 bytes not processed)


Extracting OCR:  85%|████████▌ | 3721/4360 [1:23:35<13:09,  1.24s/it]Premature end of JPEG file


Error processing image 257474: image file is truncated (0 bytes not processed)


Extracting OCR:  86%|████████▌ | 3736/4360 [1:23:54<13:50,  1.33s/it]Premature end of JPEG file


Error processing image 257624: image file is truncated (1 bytes not processed)


Extracting OCR:  86%|████████▌ | 3748/4360 [1:24:08<12:25,  1.22s/it]Premature end of JPEG file


Error processing image 257742: image file is truncated (4 bytes not processed)


Extracting OCR:  86%|████████▌ | 3750/4360 [1:24:10<11:59,  1.18s/it]Premature end of JPEG file
Extracting OCR:  86%|████████▌ | 3751/4360 [1:24:11<10:41,  1.05s/it]

Error processing image 257770: image file is truncated (5 bytes not processed)


Premature end of JPEG file


Error processing image 257776: image file is truncated (3 bytes not processed)


Extracting OCR:  86%|████████▌ | 3752/4360 [1:24:12<10:10,  1.00s/it]

Error processing image 257788: Could not find a backend to open `dataset/257788.jpg`` with iomode `r`.
Based on the extension, the following plugins might add capable backends:
  pyav:  pip install imageio[pyav]


Extracting OCR:  86%|████████▌ | 3753/4360 [1:24:13<10:03,  1.01it/s]Premature end of JPEG file


Error processing image 257798: image file is truncated (6 bytes not processed)


Extracting OCR:  86%|████████▌ | 3756/4360 [1:24:16<10:02,  1.00it/s]

Error processing image 257820: Could not find a backend to open `dataset/257820.jpg`` with iomode `r`.
Based on the extension, the following plugins might add capable backends:
  pyav:  pip install imageio[pyav]


Extracting OCR:  86%|████████▌ | 3757/4360 [1:24:17<11:55,  1.19s/it]Premature end of JPEG file
Extracting OCR:  86%|████████▌ | 3758/4360 [1:24:19<11:54,  1.19s/it]

Error processing image 257840: image file is truncated (22 bytes not processed)


Premature end of JPEG file
Extracting OCR:  86%|████████▌ | 3759/4360 [1:24:20<11:29,  1.15s/it]

Error processing image 257850: image file is truncated (3 bytes not processed)


Extracting OCR:  86%|████████▌ | 3760/4360 [1:24:21<10:48,  1.08s/it]Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file


Error processing image 257862: image file is truncated (0 bytes not processed)
Error processing image 257864: image file is truncated (4 bytes not processed)
Error processing image 257866: image file is truncated (43 bytes not processed)
Error processing image 257868: image file is truncated (4 bytes not processed)


Extracting OCR:  86%|████████▋ | 3761/4360 [1:24:21<09:01,  1.11it/s]Premature end of JPEG file


Error processing image 257876: image file is truncated (0 bytes not processed)


Extracting OCR:  86%|████████▋ | 3762/4360 [1:24:22<09:39,  1.03it/s]

Error processing image 257882: Could not find a backend to open `dataset/257882.jpg`` with iomode `r`.
Based on the extension, the following plugins might add capable backends:
  pyav:  pip install imageio[pyav]


Extracting OCR:  88%|████████▊ | 3825/4360 [1:25:44<11:21,  1.27s/it]Premature end of JPEG file


Error processing image 258516: image file is truncated (5 bytes not processed)


Extracting OCR:  88%|████████▊ | 3833/4360 [1:25:55<12:45,  1.45s/it]Premature end of JPEG file
Extracting OCR:  88%|████████▊ | 3834/4360 [1:25:55<10:19,  1.18s/it]

Error processing image 258600: image file is truncated (17 bytes not processed)


Extracting OCR:  88%|████████▊ | 3839/4360 [1:26:03<12:45,  1.47s/it]Premature end of JPEG file


Error processing image 258654: image file is truncated (2 bytes not processed)


Extracting OCR:  88%|████████▊ | 3841/4360 [1:26:05<10:51,  1.26s/it]Premature end of JPEG file


Error processing image 258676: image file is truncated (27 bytes not processed)


Extracting OCR:  88%|████████▊ | 3845/4360 [1:26:09<09:13,  1.08s/it]Premature end of JPEG file


Error processing image 258712: image file is truncated (0 bytes not processed)


Extracting OCR:  89%|████████▊ | 3859/4360 [1:26:28<10:56,  1.31s/it]

Error processing image 258852: Could not find a backend to open `dataset/258852.jpg`` with iomode `r`.
Based on the extension, the following plugins might add capable backends:
  pyav:  pip install imageio[pyav]


Premature end of JPEG file


Error processing image 258858: image file is truncated (5 bytes not processed)


Extracting OCR:  89%|████████▊ | 3860/4360 [1:26:29<09:04,  1.09s/it]Premature end of JPEG file


Error processing image 258866: image file is truncated (8 bytes not processed)


Extracting OCR:  89%|████████▊ | 3862/4360 [1:26:31<08:57,  1.08s/it]

Error processing image 258880: Could not find a backend to open `dataset/258880.jpg`` with iomode `r`.
Based on the extension, the following plugins might add capable backends:
  pyav:  pip install imageio[pyav]
Error processing image 258882: Could not find a backend to open `dataset/258882.jpg`` with iomode `r`.
Based on the extension, the following plugins might add capable backends:
  pyav:  pip install imageio[pyav]


Premature end of JPEG file
Extracting OCR:  89%|████████▊ | 3863/4360 [1:26:32<08:15,  1.00it/s]Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file


Error processing image 258890: image file is truncated (38 bytes not processed)
Error processing image 258892: image file is truncated (5 bytes not processed)
Error processing image 258894: image file is truncated (8 bytes not processed)
Error processing image 258896: image file is truncated (12 bytes not processed)
Error processing image 258898: image file is truncated (3 bytes not processed)


Extracting OCR:  89%|████████▊ | 3866/4360 [1:26:35<07:49,  1.05it/s]

Error processing image 258918: Could not find a backend to open `dataset/258918.jpg`` with iomode `r`.
Based on the extension, the following plugins might add capable backends:
  pyav:  pip install imageio[pyav]


Extracting OCR:  89%|████████▊ | 3867/4360 [1:26:37<10:07,  1.23s/it]Premature end of JPEG file


Error processing image 258936: image file is truncated (0 bytes not processed)


Premature end of JPEG file
Extracting OCR:  89%|████████▊ | 3868/4360 [1:26:37<09:08,  1.11s/it]

Error processing image 258940: image file is truncated (13 bytes not processed)


Extracting OCR: 100%|██████████| 4360/4360 [1:37:18<00:00,  1.34s/it]

OCR data has been added to the CSV. Updated file saved as 'height_with_ocr.csv'.



