## Installing Dependencies

In [None]:
!pip install aiohttp, paddlepaddle-gpu, paddleocr

## Importing modules

In [4]:
import asyncio
import os
from concurrent.futures import ThreadPoolExecutor

import aiohttp
import cv2
import nest_asyncio
import numpy as np
import pandas as pd
from paddleocr import PaddleOCR

## Handling Event Loop

In [5]:
# Patch asyncio so an event loop can be re-entered: process_in_batches() below
# calls asyncio.run(), which otherwise fails inside an already-running loop
# (presumably the situation in this notebook kernel — confirm for your front end).
nest_asyncio.apply()

## Setting Up OCR

In [6]:
# Build the English PaddleOCR pipeline once; the OCR helper below reuses it.
# Set USE_GPU to False to run inference on the CPU instead.
# NOTE(review): the Namespace logged in this cell's saved output reports
# use_gpu=False, so that run may not actually have used the GPU — confirm.
USE_GPU = True
ocr_model = PaddleOCR(lang='en', use_gpu=USE_GPU)

[2024/09/16 11:07:34] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\shiva/.paddleocr/whl\\det\\en\\en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\shiva/.paddleocr/whl\\rec\\en\\en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=

### Asynchronously fetch the image

In [7]:
async def fetch_image(image_url, session):
    """Download one image and decode it with OpenCV.

    Args:
        image_url: URL passed to ``session.get``.
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with ``status`` and an awaitable ``read()``
            (an ``aiohttp.ClientSession`` in this notebook).

    Returns:
        Whatever ``cv2.imdecode`` produces for the downloaded bytes, or
        ``None`` when the status is not 200 or any exception is raised
        (a message is printed in both failure cases).
    """
    try:
        async with session.get(image_url) as response:
            # Guard clause: anything other than HTTP 200 counts as a failed fetch.
            if response.status != 200:
                print(f"Failed to fetch image: {image_url}, status: {response.status}")
                return None

            payload = await response.read()
            raw_bytes = np.asarray(bytearray(payload), dtype="uint8")
            return cv2.imdecode(raw_bytes, cv2.IMREAD_COLOR)
    except Exception as e:
        # Best effort: report the problem and let the caller skip this image.
        print(f"Error fetching image from {image_url}: {e}")
        return None
    

## Resize Fetched Images and Run OCR

In [8]:
# Function to run OCR on the fetched image and return extracted data
def process_image(img):
    """Run OCR on one decoded image and return the recognised text lines.

    Args:
        img: Image array accepted by ``cv2.resize``.

    Returns:
        list[str]: the text of every detected line, in the order the OCR model
        reports them; ``[]`` when no text is detected.
        str: the exception message if resizing or OCR raises. This best-effort
        contract is kept so a single bad image does not abort a whole batch.
    """
    try:
        # Resize to a fixed 640x640 for faster processing (aspect ratio is not preserved).
        img_resized = cv2.resize(img, (640, 640))
        result = ocr_model.ocr(img_resized)

        # The first entry of the result belongs to this image. PaddleOCR reports
        # an image without detected text as None (or an empty list, depending on
        # the version); treat both as "no text" instead of letting len(None)
        # raise and the error message be returned as if it were OCR output.
        lines = result[0] if result else None
        if not lines:
            return []

        # Each line is (box, (text, confidence)); keep only the text.
        return [line[1][0] for line in lines]
    except Exception as e:
        return str(e)

## Parallel OCR

In [9]:
# Asynchronous function to fetch images in parallel and run OCR
async def process_images(data, max_concurrency=64):
    """Fetch every image in ``data['image_link']`` concurrently and OCR it.

    Each URL gets its own coroutine that downloads the image with
    ``fetch_image`` and then runs ``process_image`` in a thread pool, so
    downloads overlap with each other and with OCR work. Previously each
    download was awaited inside the loop, which made the fetches sequential.

    Args:
        data: Mapping/DataFrame whose ``'image_link'`` entry iterates over URLs.
        max_concurrency: Upper bound on images that are being fetched or
            OCR'd at the same time; also bounds how many decoded images are
            held in memory at once. Must be >= 1.

    Returns:
        list: one entry per URL, in input order. Each entry is whatever
        ``process_image`` returned, or ``[]`` when the fetch failed.

    Raises:
        ValueError: if ``max_concurrency`` is smaller than 1.
    """
    if max_concurrency < 1:
        raise ValueError("max_concurrency must be at least 1")

    urls = list(data['image_link'])
    if not urls:
        return []

    loop = asyncio.get_running_loop()
    gate = asyncio.Semaphore(max_concurrency)

    async with aiohttp.ClientSession() as session:
        with ThreadPoolExecutor() as executor:

            async def fetch_and_ocr(image_url):
                # Hold the slot through OCR so finished downloads cannot pile up unbounded.
                async with gate:
                    img = await fetch_image(image_url, session)
                    if img is None:
                        # Failed fetch: empty result, no need to bother the executor.
                        return []
                    return await loop.run_in_executor(executor, process_image, img)

            # gather() keeps the results in the same order as the URLs.
            return await asyncio.gather(*(fetch_and_ocr(url) for url in urls))

## Process Batches and Extract Data

In [10]:
# Process in batches for large datasets
def process_in_batches(data, batch_size=5000, output_dir="dataset/testDta"):
    """OCR ``data`` batch by batch, checkpointing each batch to CSV.

    For every slice of ``batch_size`` rows, runs ``process_images`` on it,
    stores the results in a new ``ocr_text`` column and writes the slice to
    ``<output_dir>/test<batch_num>.csv`` so finished work survives a crash.

    Args:
        data: DataFrame with the ``image_link`` column that ``process_images`` reads.
        batch_size: Number of rows per batch; must be >= 1.
        output_dir: Directory for the per-batch CSV files; created if missing.

    Returns:
        pandas.DataFrame: all batches concatenated (original index preserved)
        with the added ``ocr_text`` column. ``data`` itself is not modified.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    os.makedirs(output_dir, exist_ok=True)
    all_ocr_data = []

    # Stepping by batch_size yields no empty trailing batch when len(data) is
    # an exact multiple of batch_size.
    for batch_num, start in enumerate(range(0, len(data), batch_size)):
        # Copy the slice so adding a column neither warns nor touches `data`.
        batch_data = data.iloc[start:start + batch_size].copy()

        # Apply async processing to fetch and process images in batches
        ocr_results = list(asyncio.run(process_images(batch_data)))

        # Defensive: keep one result per row even if the counts disagree.
        missing = len(batch_data) - len(ocr_results)
        if missing > 0:
            ocr_results.extend([None] * missing)
        elif missing < 0:
            ocr_results = ocr_results[:len(batch_data)]

        # Save intermediate results to avoid data loss
        batch_data['ocr_text'] = ocr_results
        batch_data.to_csv(os.path.join(output_dir, f"test{batch_num}.csv"), index=False)

        # Collect the batch; without this the final concat had nothing to join.
        all_ocr_data.append(batch_data)

    if not all_ocr_data:
        # Empty input: return an empty frame that still has the new column.
        return data.assign(ocr_text=pd.Series(dtype="object"))
    return pd.concat(all_ocr_data)

## Loading Datasets 

In [11]:
# Read the TEST split and discard the column that is not needed downstream.
# TEST Data
data = pd.read_csv("dataset/sample_test.csv", index_col=False).drop(columns=['entity_name'])


# Load data and drop unnecessary columns
# TRAIN Data
# data = pd.read_csv("/content/Dataset/net_train.csv", index_col=False)
# data = data.drop(['entity_name', 'entity_value'], axis=1)

## Data Sampling

In [12]:
# data = data.sample(n=57000)

## Reset the Index (Required)

In [13]:
# Renumber rows 0..n-1 and discard the old index, rebinding the name instead of
# mutating the frame in place.
data = data.reset_index(drop=True)

In [14]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,index,image_link,group_id
0,1200,https://m.media-amazon.com/images/I/31porpKxyr...,478357
1,1201,https://m.media-amazon.com/images/I/31porpKxyr...,478357
2,1202,https://m.media-amazon.com/images/I/31pqPZnqje...,569206
3,1203,https://m.media-amazon.com/images/I/31pqoHDvL8...,442321
4,1204,https://m.media-amazon.com/images/I/31psy25ZB-...,276611


## Set Batch Size

In [24]:
# Rows handed to process_in_batches() per batch; adjust as per memory limits.
batch_size = 5000  

## Run the Batched OCR Pipeline

In [None]:
# Run the batched OCR pipeline and keep the frame it returns so later cells
# (such as the final save) can use it; show only a preview of the result.
processed_data = process_in_batches(data, batch_size)
processed_data.head()

In [56]:
# Save the final result
# processed_data.to_csv("dataset/TrainedOcr/train_with_ocr.csv", index=False)

## Step 6 Combining Data

In [17]:
import re
from pathlib import Path

import pandas as pd

# Directory holding the per-batch CSV files written by process_in_batches().
batch_dir = Path("dataset/testDta")

# Discover every test<N>.csv rather than assuming a fixed number of batches.
# NOTE: files left over from an earlier, larger run are picked up as well;
# clear the directory before re-running if that is not wanted.
batch_files = {}
for path in batch_dir.glob("test*.csv"):
    match = re.fullmatch(r"test(\d+)\.csv", path.name)
    if match:
        batch_files[int(match.group(1))] = str(path)

# File paths in batch order (numeric, so test10 comes after test9).
listUN = [batch_files[num] for num in sorted(batch_files)]
if not listUN:
    raise FileNotFoundError(f"No batch CSV files found in {batch_dir}")

# Read all CSV files into a list of DataFrames
dfs = [pd.read_csv(path) for path in listUN]

# Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(dfs, ignore_index=True)

# Save the combined DataFrame to a new CSV file
combined_df.to_csv('test_ocr1.csv', index=False)