## System Configs

In [None]:
# !pip install easyocr
!pip install surya-ocr

In [None]:
import pandas as pd
import numpy as np
import os
import requests
from PIL import Image
import matplotlib.pyplot as plt
from io import BytesIO
import shutil
import random
#import easyocr
import cv2

from surya.ocr import run_ocr
from surya.model.detection.model import load_model as load_det_model, load_processor as load_det_processor
from surya.model.recognition.model import load_model as load_rec_model
from surya.model.recognition.processor import load_processor as load_rec_processor

import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models, transforms
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import requests

In [None]:
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cpu


In [None]:
# Mount Google Drive at /content/drive so the files stored under
# /content/drive/MyDrive can be copied into the Colab working directory.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy the helper scripts (sanity.py, utils.py, constants.py) and the CSV
# datasets from the mounted Drive folder into /content, where the following
# cells read them from.
!cp /content/drive/MyDrive/AmazonML/student_resource/src/sanity.py /content
!cp /content/drive/MyDrive/AmazonML/student_resource/src/utils.py /content
!cp /content/drive/MyDrive/AmazonML/student_resource/dataset/sample_test.csv /content
!cp /content/drive/MyDrive/AmazonML/student_resource/dataset/sample_test_out.csv /content
!cp /content/drive/MyDrive/AmazonML/student_resource/dataset/sample_test_out_fail.csv /content
!cp /content/drive/MyDrive/AmazonML/student_resource/dataset/test.csv /content
!cp /content/drive/MyDrive/AmazonML/student_resource/dataset/train.csv /content
!cp /content/drive/MyDrive/AmazonML/student_resource/src/constants.py /content

In [None]:
# Locations of the CSV files copied into the Colab working directory.
train_csv_path, test_csv_path, sample_test_csv_path = (
    '/content/train.csv',
    '/content/test.csv',
    '/content/sample_test.csv',
)

In [None]:
# Read the train, test and sample-test CSV files into DataFrames.
train_df, test_df, sample_test = (
    pd.read_csv(csv_path)
    for csv_path in (train_csv_path, test_csv_path, sample_test_csv_path)
)

# Print the first training rows as a quick look at the columns.
print("Training Data Head:", train_df.head(), sep="\n")


Training Data Head:
                                          image_link  group_id  entity_name  \
0  https://m.media-amazon.com/images/I/61I9XdN6OF...    748919  item_weight   
1  https://m.media-amazon.com/images/I/71gSRbyXmo...    916768  item_volume   
2  https://m.media-amazon.com/images/I/61BZ4zrjZX...    459516  item_weight   
3  https://m.media-amazon.com/images/I/612mrlqiI4...    459516  item_weight   
4  https://m.media-amazon.com/images/I/617Tl40LOX...    731432  item_weight   

     entity_value  
0      500.0 gram  
1         1.0 cup  
2      0.709 gram  
3      0.709 gram  
4  1400 milligram  


In [None]:
# Count the different group_id values present in the training data
# (missing values are not counted).
num_distinct = train_df.loc[:, 'group_id'].nunique(dropna=True)

In [None]:
# Show the number of distinct group_id values as the cell output.
num_distinct

750

In [None]:
# Units accepted for each entity name. Entities measuring the same physical
# quantity get equal (but independent) sets of units.
entity_unit_map = {
    # Linear dimensions.
    **{
        dimension: {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'}
        for dimension in ('width', 'depth', 'height')
    },
    # Weights.
    **{
        weight: {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'}
        for weight in ('item_weight', 'maximum_weight_recommendation')
    },
    # Electrical quantities.
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    # Volumes.
    'item_volume': {
        'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre',
        'fluid ounce', 'gallon', 'imperial gallon', 'litre', 'microlitre',
        'millilitre', 'pint', 'quart',
    },
}

# Every unit that is valid for at least one entity.
allowed_units = set().union(*entity_unit_map.values())

## RUN SANITY CHECK FOR INITIAL AND OUTPUT

In [None]:
!python sanity.py --test_filename ../content/sample_test.csv --output_filename ../content/sample_test_out.csv

Parsing successfull for file: ../content/sample_test_out.csv


In [None]:
!python /content/sanity.py --test_filename ../content/sample_test.csv --output_filename ../content/sample_test_out_fail.csv

Error: Invalid unit [lbs] found in 6.75 lbs. Allowed units: {'inch', 'centimetre', 'millivolt', 'pint', 'foot', 'pound', 'cubic inch', 'millilitre', 'ton', 'microgram', 'centilitre', 'metre', 'imperial gallon', 'gram', 'yard', 'milligram', 'kilovolt', 'kilowatt', 'litre', 'fluid ounce', 'gallon', 'cubic foot', 'volt', 'quart', 'decilitre', 'ounce', 'cup', 'kilogram', 'microlitre', 'millimetre', 'watt'}


## Process Images

## OCR


In [None]:
# Languages passed to the OCR call.
langs = ["en"]

# Surya text-detection components first, then the text-recognition components.
det_processor = load_det_processor()
det_model = load_det_model()
rec_model = load_rec_model()
rec_processor = load_rec_processor()

In [None]:
def get_data_from_ocr(url: str) -> list:
    """Download the image at ``url`` and return the text lines found by OCR.

    The image bytes are fetched with ``urlopen``, decoded with PIL and passed to
    ``run_ocr`` together with the module-level ``langs`` and the detection and
    recognition models/processors.

    Args:
        url: Address of the image to read.

    Returns:
        The recognised strings, one per text line of the first prediction, in
        the order ``run_ocr`` reports them. An empty list is returned when the
        download, decoding or OCR step raises, so that a single bad image does
        not abort a batch run; the error is printed instead.
    """
    from urllib.request import urlopen

    try:
        # Read the full payload inside a context manager so the HTTP connection
        # is always closed, and use a timeout so a stalled download cannot
        # block the whole batch.
        with urlopen(url, timeout=30) as http_response:
            image = Image.open(BytesIO(http_response.read()))

        predictions = run_ocr([image], [langs], det_model, det_processor, rec_model, rec_processor)
        captured_strings = [line.text for line in predictions[0].text_lines]
        print({url: captured_strings})
    except Exception as e:
        print(f"Error processing image at {url}: {e}")
        return []

    return captured_strings
def process_with_periodic_save(df: pd.DataFrame, save_interval: int, save_path: str, start_row: int = 0, end_row: int = None):
    """Run OCR on a positional range of rows and checkpoint the frame to CSV.

    For every row in ``df.iloc[start_row:end_row]`` the ``image_link`` value is
    passed to ``get_data_from_ocr`` and the result is stored in the
    ``ocr_data`` column of ``df`` (the frame is modified in place).

    Args:
        df: Frame with an ``image_link`` column. An ``ocr_data`` column is
            created when it does not exist yet.
        save_interval: The whole frame is written to ``save_path`` each time
            the 1-based row position is a multiple of this value.
        save_path: Destination CSV file, written without the index.
        start_row: Position of the first row to process.
        end_row: Position after the last row to process; ``None`` means the
            end of the frame.
    """
    if end_row is None:
        end_row = len(df)  # No explicit end: process until the last row.

    # Make sure the target column exists before single cells are assigned.
    if 'ocr_data' not in df.columns:
        df['ocr_data'] = None

    # iterrows() on the slice yields the ORIGINAL index label of each row, so
    # the label is used as-is for the write. The position is tracked separately
    # and only drives the save interval; adding start_row to the label would
    # write the result to the wrong row.
    selected_rows = df.iloc[start_row:end_row]
    for position, (label, row) in enumerate(selected_rows.iterrows(), start=start_row):
        df.at[label, 'ocr_data'] = get_data_from_ocr(row['image_link'])

        # Checkpoint after every 'save_interval' rows.
        if (position + 1) % save_interval == 0:
            print(f"Saving progress at row {position + 1}...")
            df.to_csv(save_path, index=False)

    # Final checkpoint once the requested range is done.
    df.to_csv(save_path, index=False)
    print("Final save completed.")




In [None]:
# Positional range of training rows handled in this session (end_row is
# exclusive) and the CSV file that receives the periodic snapshots.
save_path = 'ocr_data.csv'
start_row = 47552
end_row = 62000

# Create the OCR result column only when it is missing, so re-running this cell
# does not wipe results already stored in train_df.
if 'ocr_data' not in train_df.columns:
    train_df['ocr_data'] = None

# Process rows start_row .. end_row - 1, saving every 50 rows.
process_with_periodic_save(train_df, save_interval=50, save_path=save_path, start_row=start_row, end_row=end_row)

In [None]:
# # Initialize the reader
# reader = easyocr.Reader(['all'])

# # Perform OCR
# result = reader.readtext('test.jpg')
# print(result)

  net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device)))
  state_dict = torch.load(model_path, map_location=device)


[([[622, 450], [872, 450], [872, 498], [622, 498]], "Horbaach'", 0.8040294705884506), ([[537, 608], [674, 608], [674, 656], [537, 656]], 'HIGH', 0.46724188327789307), ([[691, 608], [968, 608], [968, 656], [691, 656]], 'StRENGTH', 0.6227695422664076), ([[431, 655], [1090, 655], [1090, 800], [431, 800]], 'PSYLLIUM', 0.9996113747751084), ([[547, 789], [959, 789], [959, 921], [547, 921]], 'HUSK', 0.9882693290710449), ([[787, 961], [1049, 961], [1049, 999], [787, 999]], 'PLANTAGO OVATA', 0.9986013159171112), ([[458, 958], [710, 958], [710, 1036], [458, 1036]], '1400mG', 0.4404229749914589), ([[787, 1003], [985, 1003], [985, 1039], [787, 1039]], 'PLANT SEEDS', 0.9631550742130158), ([[476, 1230], [560, 1230], [560, 1260], [476, 1260]], 'FOOD', 0.7478782731537303), ([[911, 1224], [1041, 1224], [1041, 1266], [911, 1266]], 'VEGAN', 0.999770431891911), ([[475, 1263], [661, 1263], [661, 1299], [475, 1299]], 'SUPPLEMENT', 0.996968049128284), ([[760, 1222], [910, 1222], [910, 1300], [760, 1300]], '3

In [None]:
# def get_data_from_ocr(url: str) -> list:
#   from urllib.request import urlopen
#   response = dict()
#   captured_strings = []
#   try:
#       image = Image.open(urlopen(url))
#       result = reader.readtext('test.jpg')
#       captured_strings = [text[1] for text in result]
#       response[url] = captured_strings
#       print(response)
#   except Exception as e:
#       print(f"Error processing image at {url}: {e}")
#       return []
#   torch.cuda.empty_cache()
#   return response[url]

In [None]:
# def process_with_periodic_save(df: pd.DataFrame, save_interval: int, save_path: str):
#     for idx, row in df.iterrows():
#         # Apply OCR function to each row's image link
#         df.at[idx, 'ocr_data'] = get_data_from_ocr(row['image_link'])

#         # Save after every 'save_interval' rows
#         if (idx + 1) % save_interval == 0:
#             print(f"Saving progress at row {idx + 1}...")
#             df.to_csv(save_path, index=False)

#     # Save the final DataFrame after the loop ends
#     df.to_csv(save_path, index=False)
#     print("Final save completed.")

In [None]:
# # Usage example:
# train_df['ocr_data'] = None  # Add a column to store OCR results if not already present
# save_path = 'ocr_data.csv'
# process_with_periodic_save(train_df, save_interval=50, save_path=save_path)

{'https://m.media-amazon.com/images/I/61I9XdN6OFL.jpg': ["Horbaach'", 'HIGH', 'StRENGTH', 'PSYLLIUM', 'HUSK', 'PLANTAGO OVATA', '1400mG', 'PLANT SEEDS', 'FOOD', 'VEGAN', 'SUPPLEMENT', '365', 'CAPSULES']}


KeyboardInterrupt: 

In [None]:
# print(train_df.head())

                                          image_link  group_id  entity_name  \
0  https://m.media-amazon.com/images/I/61I9XdN6OF...    748919  item_weight   
1  https://m.media-amazon.com/images/I/71gSRbyXmo...    916768  item_volume   
2  https://m.media-amazon.com/images/I/61BZ4zrjZX...    459516  item_weight   
3  https://m.media-amazon.com/images/I/612mrlqiI4...    459516  item_weight   
4  https://m.media-amazon.com/images/I/617Tl40LOX...    731432  item_weight   

     entity_value                                           ocr_data  
0      500.0 gram  [INGRÉDIENT MÉNAGER,  MULTI-USAGE, TERRE DE , ...  
1         1.0 cup  [LEBENSMITTELECHT, GEPRÄGTES , DESIGN, Designe...  
2      0.709 gram  {'https://m.media-amazon.com/images/I/61BZ4zrj...  
3      0.709 gram  {'https://m.media-amazon.com/images/I/612mrlqi...  
4  1400 milligram  {'https://m.media-amazon.com/images/I/617Tl40L...  


# Training

## BERT-base NER
